pax_global_header00006660000000000000000000000064151132461200014505gustar00rootroot0000000000000052 comment=3f5119ae6e4166ca7870e5861aa5785d4fd8a372 unicode-segmentation-rs-0.2.0/000077500000000000000000000000001511324612000162475ustar00rootroot00000000000000unicode-segmentation-rs-0.2.0/.github/000077500000000000000000000000001511324612000176075ustar00rootroot00000000000000unicode-segmentation-rs-0.2.0/.github/FUNDING.yml000066400000000000000000000003751511324612000214310ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: CC0-1.0 # This file is maintained in https://github.com/WeblateOrg/meta/ github: WeblateOrg open_collective: weblate liberapay: Weblate custom: https://weblate.org/donate/ unicode-segmentation-rs-0.2.0/.github/ISSUE_TEMPLATE/000077500000000000000000000000001511324612000217725ustar00rootroot00000000000000unicode-segmentation-rs-0.2.0/.github/ISSUE_TEMPLATE/bug_report.yml000066400000000000000000000050231511324612000246650ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: CC0-1.0 # This file is maintained in https://github.com/WeblateOrg/meta/ # and generated from .github/ISSUE_TEMPLATE/snippets there. name: Reproducible bug report description: Create a report to help us improve body: - type: markdown attributes: value: | Thank you for reporting an issue. This form guides you in creating a useful issue report. Want your answer quickly and guaranteed? Visit https://weblate.org/support/ to reach our dedicated support team. As a subscriber, you will always have priority and help Weblate growing. - type: textarea id: what-happened attributes: label: Describe the issue description: > A clear and concise description of the problem you are facing. Please include important information, like the file format you are using and installed add-ons. placeholder: Tell us what you see! validations: required: true - type: checkboxes id: tried attributes: label: I already tried description: If you didn’t try already, try searching the documentation and existing issues. options: - label: I've read and searched [the documentation](https://docs.weblate.org/). required: true - label: I've searched for similar filed issues in this repository. required: true - type: textarea id: reproducer attributes: label: Steps to reproduce the behavior description: What did you do before the problem appeared? placeholder: | 1. Go to '…' 2. Scroll down to '…' 3. Click on '…' 4. The error occurs validations: required: true - type: textarea id: expected attributes: label: Expected behavior description: A clear and concise description of what you expected to happen. placeholder: Tell us what you want to see! - type: textarea id: screenshots attributes: label: Screenshots description: If applicable, add screenshots to better explain your problem. - type: textarea id: traceback attributes: label: Exception traceback description: > In case you observed a server error or crash, please read [the debugging documentation](https://docs.weblate.org/en/latest/contributing/debugging.html) for information on obtaining the relevant logs. This field will be rendered as a Python traceback automatically. render: pytb - type: textarea id: additional attributes: label: Additional context description: Add any other contextual info about the problem here. 
unicode-segmentation-rs-0.2.0/.github/ISSUE_TEMPLATE/config.yml000066400000000000000000000015501511324612000237630ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: CC0-1.0 # # This file is maintained in https://github.com/WeblateOrg/meta/ # and generated using update-issue-config there. contact_links: - name: Read our clear, thorough and localized docs url: https://docs.weblate.org/ about: Save your time! There is an instant solution for many issues in the docs appreciated by numerous users. And it might be in your preferred language. - name: Get professional support url: https://weblate.org/support/ about: As a subscriber, you will always receive fast and helpful replies from our dedicated support team. More responsible and faster for your business, also makes Weblate stronger. - name: Ask the community url: https://github.com/WeblateOrg/weblate/discussions about: Want to discuss something with a community? Do it in discussions! unicode-segmentation-rs-0.2.0/.github/ISSUE_TEMPLATE/feature_request.yml000066400000000000000000000033141511324612000257210ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: CC0-1.0 # This file is maintained in https://github.com/WeblateOrg/meta/ # and generated from .github/ISSUE_TEMPLATE/snippets there. name: Feature request description: Suggest an idea for this project body: - type: markdown attributes: value: | Thank you for requesting a change. This form guides you in creating a useful feature request. Want your answer quickly and guaranteed? Visit https://weblate.org/support/ to reach our dedicated support team. As a subscriber, you will always have priority and help Weblate growing. - type: textarea id: describe attributes: label: Describe the problem description: > Is your feature request related to a problem? If so, please provide a clear and concise description of what the problem is. placeholder: I'm always frustrated when… validations: required: true - type: textarea id: solution attributes: label: Solution brainstorm description: We know you have ideas how to address this, please share it. placeholder: I'd like to get… validations: required: true - type: textarea id: alternatives attributes: label: Describe alternatives you have considered description: A clear and concise description of any alternative solutions or features you have considered. placeholder: The issue can also be addressed by… - type: textarea id: screenshots attributes: label: Screenshots description: If applicable, add screenshots to better explain your problem. - type: textarea id: additional attributes: label: Additional context description: Add any other contextual info about the problem here. 
unicode-segmentation-rs-0.2.0/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000010711511324612000234070ustar00rootroot00000000000000 unicode-segmentation-rs-0.2.0/.github/PULL_REQUEST_TEMPLATE.md.license000066400000000000000000000002231511324612000250260ustar00rootroot00000000000000Copyright © Michal Čihař SPDX-License-Identifier: CC0-1.0 This file is maintained in https://github.com/WeblateOrg/meta/ unicode-segmentation-rs-0.2.0/.github/renovate.json000066400000000000000000000001771511324612000223320ustar00rootroot00000000000000{ "$schema": "https://docs.renovatebot.com/renovate-schema.json", "extends": [ "github>WeblateOrg/meta:renovate" ] } unicode-segmentation-rs-0.2.0/.github/renovate.json.license000066400000000000000000000002231511324612000237430ustar00rootroot00000000000000Copyright © Michal Čihař SPDX-License-Identifier: CC0-1.0 This file is maintained in https://github.com/WeblateOrg/meta/ unicode-segmentation-rs-0.2.0/.github/workflows/000077500000000000000000000000001511324612000216445ustar00rootroot00000000000000unicode-segmentation-rs-0.2.0/.github/workflows/ci.yml000066400000000000000000000243611511324612000227700ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: MIT name: CI on: push: branches: - main tags: - '*' pull_request: workflow_dispatch: permissions: contents: read jobs: test: runs-on: ${{ matrix.os }} strategy: matrix: os: - ubuntu-latest python-version: - '3.10' - '3.11' - '3.12' - '3.13' - '3.14' include: - os: windows-latest python-version: '3.14' - os: macos-latest python-version: '3.14' - os: ubuntu-24.04-arm python-version: '3.14' name: Tests, Python ${{ matrix.python-version }}, ${{ matrix.os }} steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 id: setup_python with: python-version: ${{ matrix.python-version }} - uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # v7.1.4 # zizmor: ignore[cache-poisoning] # The cache is stored only on main branch and not used for publishing with: save-cache: ${{ github.ref == 'refs/heads/main' }} cache-suffix: ${{ steps.setup_python.outputs.python-version }} version: 0.9.13 activate-environment: 'true' - run: uv sync - run: pytest linux: runs-on: ${{ matrix.platform.runner }} name: Package linux, ${{ matrix.platform.target }}, ${{ matrix.platform.runner }} strategy: matrix: platform: - runner: ubuntu-latest target: x86_64 - runner: ubuntu-latest target: x86 - runner: ubuntu-24.04-arm target: aarch64 - runner: ubuntu-latest target: armv7 - runner: ubuntu-latest target: s390x - runner: ubuntu-latest target: ppc64le steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: 3.x - name: Build wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist sccache: false manylinux: auto - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist -i python3.14t sccache: false manylinux: auto - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ 
matrix.platform.target }} args: --release --out dist -i python3.13t sccache: false manylinux: auto - name: Upload wheels uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: wheels-linux-${{ matrix.platform.target }} path: dist musllinux: runs-on: ${{ matrix.platform.runner }} name: Package musllinux, ${{ matrix.platform.target }}, ${{ matrix.platform.runner }} strategy: matrix: platform: - runner: ubuntu-latest target: x86_64 - runner: ubuntu-latest target: x86 - runner: ubuntu-latest target: aarch64 - runner: ubuntu-latest target: armv7 steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: 3.x - name: Build wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist sccache: false manylinux: musllinux_1_2 - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist -i python3.13t sccache: false manylinux: musllinux_1_2 - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist -i python3.14t sccache: false manylinux: musllinux_1_2 - name: Upload wheels uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: wheels-musllinux-${{ matrix.platform.target }} path: dist windows: runs-on: ${{ matrix.platform.runner }} name: Package windows, ${{ matrix.platform.target }}, ${{ matrix.platform.runner }} strategy: matrix: platform: - runner: windows-latest target: x64 - runner: windows-latest target: x86 steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: 3.x architecture: ${{ matrix.platform.target }} - name: Build wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist sccache: false - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: 3.13t architecture: ${{ matrix.platform.target }} - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist -i python3.13t sccache: false - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist -i python3.14t sccache: false - name: Upload wheels uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: wheels-windows-${{ matrix.platform.target }} path: dist macos: runs-on: ${{ matrix.platform.runner }} name: Package macos, ${{ matrix.platform.target }}, ${{ matrix.platform.runner }} strategy: matrix: platform: - runner: macos-15-intel target: x86_64 - runner: macos-15 target: aarch64 steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: 3.x - name: Build 
wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist sccache: false - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist -i python3.13t sccache: false - name: Build free-threaded wheels uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: target: ${{ matrix.platform.target }} args: --release --out dist -i python3.14t sccache: false - name: Upload wheels uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: wheels-macos-${{ matrix.platform.target }} path: dist sdist: runs-on: ubuntu-latest name: Package sdist steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - name: Build sdist uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: command: sdist args: --out dist - name: Upload sdist uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: name: wheels-sdist path: dist all-tests: needs: - test - linux - musllinux - windows - macos - sdist name: CI passed runs-on: ubuntu-slim steps: - run: echo release: name: Release runs-on: ubuntu-latest if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} needs: all-tests permissions: # Use to sign the release artifacts id-token: write # Used to upload release artifacts contents: write # Used to generate artifact attestation attestations: write steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - name: Generate artifact attestation uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 with: subject-path: wheels-*/* - name: Publish to PyPI if: ${{ startsWith(github.ref, 'refs/tags/') }} uses: PyO3/maturin-action@86b9d133d34bc1b40018696f782949dac11bd380 # v1.49.4 with: command: upload args: --non-interactive --skip-existing wheels-*/* release_github: runs-on: ubuntu-latest name: Create release on GitHub permissions: contents: write needs: all-tests if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} steps: - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - uses: ncipollo/release-action@b7eabc95ff50cbeeedec83973935c8f306dfcd0b # v1.20.0 with: generateReleaseNotes: true artifacts: wheels-*/* unicode-segmentation-rs-0.2.0/.github/workflows/pre-commit.yml000066400000000000000000000070071511324612000244470ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: CC0-1.0 # This file is maintained in https://github.com/WeblateOrg/meta/ name: Pre-commit check on: push: branches-ignore: - renovate/** - weblate pull_request: permissions: contents: read jobs: pre-commit: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 with: persist-credentials: false - name: Get cache tag id: get-date run: | echo "cache_tag=$(/bin/date --utc '+%Y%m')" >> "$GITHUB_OUTPUT" echo "previous_cache_tag=$(/bin/date --date='1 month ago' --utc '+%Y%m')" >> "$GITHUB_OUTPUT" shell: bash - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 id: pre-commit-cache with: path: ~/.cache/pre-commit 
key: ${{ runner.os }}-pre-commit-${{ steps.get-date.outputs.cache_tag }}-${{ hashFiles('.pre-commit-config.yaml') }} restore-keys: | ${{ runner.os }}-pre-commit-${{ steps.get-date.outputs.cache_tag }} ${{ runner.os }}-pre-commit-${{ steps.get-date.outputs.previous_cache_tag }} ${{ runner.os }}-pre-commit- - name: Setup Python uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: '3.14' - uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # v7.1.4 with: enable-cache: false - name: detect method id: detect run: | if test -f requirements-lint.txt ; then echo "method=requirements" >> "$GITHUB_OUTPUT" elif test -f pyproject.toml && grep -q dependency-groups pyproject.toml ; then echo "method=pep735" >> "$GITHUB_OUTPUT" elif test -f pyproject.toml && grep -q dependency-groups pyproject.toml ; then echo "method=pyproject" >> "$GITHUB_OUTPUT" else echo "method=uvx" >> "$GITHUB_OUTPUT" fi - name: pre-commit (PEP 735) if: steps.detect.outputs.method == 'pep735' run: | uv run --only-group pre-commit pre-commit run --all --show-diff-on-failure uv run --only-group pre-commit pre-commit gc env: RUFF_OUTPUT_FORMAT: github REUSE_OUTPUT_FORMAT: github GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: pre-commit (uvx) if: steps.detect.outputs.method == 'uvx' run: | uvx pre-commit run --all --show-diff-on-failure uvx pre-commit gc env: RUFF_OUTPUT_FORMAT: github REUSE_OUTPUT_FORMAT: github GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Install dependencies if: steps.detect.outputs.method == 'requirements' run: uv pip install --system -r requirements-lint.txt - name: Install dependencies if: steps.detect.outputs.method == 'pyproject' run: uv pip install --system "$(sed -n 's/.*"\(pre-commit==\([^"]*\)\)".*/\1/p' pyproject.toml)" - name: pre-commit (installed) if: steps.detect.outputs.method == 'requirements' || steps.detect.outputs.method == 'pyproject' run: | pre-commit run --all --show-diff-on-failure pre-commit gc env: RUFF_OUTPUT_FORMAT: github REUSE_OUTPUT_FORMAT: github GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: diff run: git diff if: always() - uses: pre-commit-ci/lite-action@5d6cc0eb514c891a40562a58a8e71576c5c7fb43 # v1.1.0 if: always() with: msg: 'chore(pre-commit): apply code formatting' unicode-segmentation-rs-0.2.0/.gitignore000066400000000000000000000111611511324612000202370ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py.cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # UV # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. #uv.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock #poetry.toml # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. # https://pdm-project.org/en/latest/usage/project/#working-with-version-control #pdm.lock #pdm.toml .pdm-python .pdm-build/ # pixi # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. #pixi.lock # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one # in the .venv directory. It is recommended not to include this directory in version control. .pixi # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .envrc .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ # Abstra # Abstra is an AI-powered process automation framework. # Ignore directories containing user credentials, local state, and settings. 
# Learn more at https://abstra.io/docs .abstra/ # Visual Studio Code # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore # and can be added to the global gitignore or merged into this file. However, if you prefer, # you could uncomment the following to ignore the entire vscode folder # .vscode/ # Ruff stuff: .ruff_cache/ # PyPI configuration file .pypirc # Cursor # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data # refer to https://docs.cursor.com/context/ignore-files .cursorignore .cursorindexingignore # Marimo marimo/_static/ marimo/_lsp/ __marimo__/ # Lock files /Cargo.lock /uv.lock unicode-segmentation-rs-0.2.0/.markdownlint.yml000066400000000000000000000006231511324612000215620ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: CC0-1.0 # # This file is maintained in https://github.com/WeblateOrg/meta/ # We do not enforce heading first-line-heading: false # Table style is not well aligned with Unicode table-column-style: false # Avoid length limit checks line-length: false # Allow manual markup for branding html: allowed_elements: [a, img] unicode-segmentation-rs-0.2.0/.pre-commit-config.yaml000066400000000000000000000031601511324612000225300ustar00rootroot00000000000000# See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-toml - id: check-merge-conflict - id: check-json - id: debug-statements - id: mixed-line-ending args: [--fix=lf] - id: pretty-format-json args: [--no-sort-keys, --autofix] - repo: https://github.com/adrienverge/yamllint rev: v1.37.1 hooks: - id: yamllint - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.6 hooks: - id: ruff-check args: - --fix - --exit-non-zero-on-fix - id: ruff-format - repo: meta hooks: - id: check-hooks-apply - id: check-useless-excludes - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks rev: v2.15.0 hooks: - id: pretty-format-yaml args: [--autofix, --indent, '2'] - id: pretty-format-toml args: [--autofix] - repo: https://github.com/executablebooks/mdformat rev: 1.0.0 hooks: - id: mdformat additional_dependencies: - mdformat-gfm==1.0.0 - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.46.0 hooks: - id: markdownlint - repo: https://github.com/fsfe/reuse-tool rev: v6.2.0 hooks: - id: reuse - repo: https://github.com/rhysd/actionlint rev: v1.7.9 hooks: - id: actionlint - repo: https://github.com/zizmorcore/zizmor-pre-commit rev: v1.17.0 hooks: - id: zizmor - repo: https://github.com/gitleaks/gitleaks rev: v8.30.0 hooks: - id: gitleaks ci: autoupdate_schedule: quarterly unicode-segmentation-rs-0.2.0/.yamllint.yml000066400000000000000000000004501511324612000207000ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: CC0-1.0 # # This file is maintained in https://github.com/WeblateOrg/meta/ # extends: default rules: line-length: max: 500 level: error document-start: disable indentation: indent-sequences: false unicode-segmentation-rs-0.2.0/Cargo.toml000066400000000000000000000010311511324612000201720ustar00rootroot00000000000000[dependencies] pyo3 = {version = 
"0.27.1", features = ["extension-module", "abi3-py310", "generate-import-lib"]} unicode-segmentation = "1.12.0" unicode-width = "0.2.2" [lib] crate-type = ["cdylib"] name = "unicode_segmentation_rs" [package] authors = ["Michal Čihař "] description = "Unicode segmentation and width for Python using Rust" edition = "2024" license = "MIT" name = "unicode-segmentation-rs" readme = "README.md" repository = "https://github.com/WeblateOrg/unicode-segmentation-rs" version = "0.2.0" unicode-segmentation-rs-0.2.0/LICENSE000066400000000000000000000020571511324612000172600ustar00rootroot00000000000000MIT License Copyright (c) 2025 Michal Čihař Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. unicode-segmentation-rs-0.2.0/LICENSES/000077500000000000000000000000001511324612000174545ustar00rootroot00000000000000unicode-segmentation-rs-0.2.0/LICENSES/CC0-1.0.txt000066400000000000000000000156101511324612000210610ustar00rootroot00000000000000Creative Commons Legal Code CC0 1.0 Universal CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER. Statement of Purpose The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. 
For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. 1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: i. the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; ii. moral rights retained by the original author(s) and/or performer(s); iii. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; iv. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; v. rights protecting the extraction, dissemination, use and reuse of data in a Work; vi. database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and vii. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. 2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. 3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. 
In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. 4. Limitations and Disclaimers. a. No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. b. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. c. Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. d. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. unicode-segmentation-rs-0.2.0/LICENSES/MIT.txt000066400000000000000000000020661511324612000206520ustar00rootroot00000000000000MIT License Copyright (c) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
unicode-segmentation-rs-0.2.0/README.md000066400000000000000000000163711511324612000175360ustar00rootroot00000000000000
# unicode-segmentation-rs

Python bindings for the Rust [unicode-segmentation](https://docs.rs/unicode-segmentation/) and [unicode-width](https://docs.rs/unicode-width/) crates, providing Unicode text segmentation and width calculation according to Unicode standards.

## Features

- **Grapheme Cluster Segmentation**: Split text into user-perceived characters
- **Word Segmentation**: Split text into words according to Unicode rules
- **Sentence Segmentation**: Split text into sentences
- **Display Width Calculation**: Get the display width of text (for terminal/monospace display)
- **Gettext PO Wrapping**: Wrap text for gettext PO files with proper handling of escape sequences and CJK characters

## Installation

### From PyPI

```bash
uv pip install unicode-segmentation-rs
```

### From source

```bash
# Install maturin
pip install maturin

# Build and install the package
maturin develop --release
```

## Usage

```python
import unicode_segmentation_rs

# Grapheme clusters (user-perceived characters)
text = "Hello 👨‍👩‍👧‍👦 World"
clusters = unicode_segmentation_rs.graphemes(text, is_extended=True)
print(clusters)  # ['H', 'e', 'l', 'l', 'o', ' ', '👨‍👩‍👧‍👦', ' ', 'W', 'o', 'r', 'l', 'd']

# Get grapheme clusters with their byte indices
indices = unicode_segmentation_rs.grapheme_indices(text, is_extended=True)
print(indices)  # [(0, 'H'), (1, 'e'), ...]

# Word boundaries (includes punctuation and whitespace)
text = "Hello, world!"
words = unicode_segmentation_rs.split_word_bounds(text)
print(words)  # ['Hello', ',', ' ', 'world', '!']

# Unicode words (excludes punctuation and whitespace)
words = unicode_segmentation_rs.unicode_words(text)
print(words)  # ['Hello', 'world']

# Word indices
indices = unicode_segmentation_rs.split_word_bound_indices(text)
print(indices)  # [(0, 'Hello'), (5, ','), ...]

# Sentence segmentation
text = "Hello world. How are you? I'm fine."
sentences = unicode_segmentation_rs.unicode_sentences(text)
print(sentences)  # ['Hello world. ', 'How are you? ', "I'm fine."]

# Display width calculation
text = "Hello 世界"
width = unicode_segmentation_rs.text_width(text)
print(width)  # 10 (Hello=5, space=1, 世=2, 界=2; actual terminal rendering may vary)

# Character width
print(unicode_segmentation_rs.text_width('A'))   # 1
print(unicode_segmentation_rs.text_width('世'))  # 2
print(unicode_segmentation_rs.text_width('\t'))  # 1 (control characters count as width 1)
```

## Examples

### Grapheme Cluster Segmentation

```python
import unicode_segmentation_rs

# Complex emojis and combining characters
text = "Hello 👨‍👩‍👧‍👦 नमस्ते"
print(f"Text: {text}")
print(f"Graphemes: {unicode_segmentation_rs.graphemes(text, is_extended=True)}")
print(f"Length (graphemes): {len(unicode_segmentation_rs.graphemes(text, is_extended=True))}")
print(f"Length (chars): {len(text)}")

# With indices
print("Grapheme indices:")
for idx, cluster in unicode_segmentation_rs.grapheme_indices(text, is_extended=True):
    print(f"  {idx:3d}: {cluster!r}")
```

### Word Segmentation

```python
text = "Hello, world! How are you?"
print(f"Text: {text}")
print(f"Word bounds: {unicode_segmentation_rs.split_word_bounds(text)}")
print(f"Unicode words: {unicode_segmentation_rs.unicode_words(text)}")

# With indices
print("Word boundary indices:")
for idx, word in unicode_segmentation_rs.split_word_bound_indices(text):
    print(f"  {idx:3d}: {word!r}")
```

### Sentence Segmentation

```python
text = "Hello world. How are you? I'm fine, thanks! What about you?"
print(f"Text: {text}")
sentences = unicode_segmentation_rs.unicode_sentences(text)
print("Sentences:")
for i, sentence in enumerate(sentences, 1):
    print(f"  {i}. {sentence!r}")
```

### Multilingual Examples

```python
# Arabic
arabic = "مرحبا بك. كيف حالك؟"
print(f"Arabic: {arabic}")
print(f"Sentences: {unicode_segmentation_rs.unicode_sentences(arabic)}")

# Japanese
japanese = "こんにちは。お元気ですか?"
print(f"Japanese: {japanese}")
print(f"Sentences: {unicode_segmentation_rs.unicode_sentences(japanese)}")

# Mixed languages
mixed = "Hello世界! This is a test文章."
print(f"Mixed: {mixed}")
print(f"Words: {unicode_segmentation_rs.unicode_words(mixed)}")
```

### Display Width Calculation

```python
examples = [
    "Hello",
    "世界",
    "Hello 世界",
    "こんにちは",
    "🎉🎊",
    "Tab\there",
]

for text in examples:
    width = unicode_segmentation_rs.text_width(text)
    print(f"Text: {text!r:20} Width: {width:2} Chars: {len(text):2}")

# Character widths
chars = ['a', 'A', '1', ' ', '世', '界', 'あ', '🎉', '\t', '\n']
for c in chars:
    w = unicode_segmentation_rs.text_width(c)
    print(f"  {c!r:6} width: {w}")
```

### Gettext PO File Wrapping

```python
# Wrap text for PO files (default width is 77 characters)
text = "This is a long translation string that needs to be wrapped appropriately for a gettext PO file"
lines = unicode_segmentation_rs.gettext_wrap(text, 77)
for i, line in enumerate(lines, 1):
    print(f"Line {i}: {line}")

# Wrapping with CJK characters
text = "This translation contains 中文字符 (Chinese characters) and should wrap correctly"
lines = unicode_segmentation_rs.gettext_wrap(text, 40)
for line in lines:
    width = unicode_segmentation_rs.text_width(line)
    print(f"[{width:2d} cols] {line}")

# Escape sequences are preserved
text = "This has\\nline breaks\\tand tabs"
lines = unicode_segmentation_rs.gettext_wrap(text, 20)
print(lines)
```

## API Reference

### `graphemes(text: str, is_extended: bool) -> list[str]`

Split a string into grapheme clusters. Set `is_extended=True` for extended grapheme clusters (recommended).

### `grapheme_indices(text: str, is_extended: bool) -> list[tuple[int, str]]`

Split a string into grapheme clusters with their byte indices.

### `split_word_bounds(text: str) -> list[str]`

Split a string at word boundaries (includes punctuation and whitespace).

### `split_word_bound_indices(text: str) -> list[tuple[int, str]]`

Split a string at word boundaries with byte indices.

### `unicode_words(text: str) -> list[str]`

Get Unicode words from a string (excludes punctuation and whitespace).

### `unicode_sentences(text: str) -> list[str]`

Split a string into sentences according to Unicode rules.

### `text_width(text: str) -> int`

Get the display width of a string in columns (as it would appear in a terminal). East Asian characters typically take 2 columns.

### `gettext_wrap(text: str, width: int) -> list[str]`

Wrap text for gettext PO files. This function follows gettext's wrapping behavior:

- Never breaks escape sequences (`\n`, `\"`, etc.)
- Prefers breaking after spaces
- Handles CJK characters with proper width calculation
- Breaks long words only when necessary

## Building for Distribution

```bash
# Build wheel
maturin build --release

# Build and publish to PyPI
maturin publish
```

## Running Tests

```bash
# Install test dependencies
pip install pytest

# Run tests
pytest tests/
```

## License

This project is licensed under the MIT License, the same license as the underlying unicode-segmentation crate.

unicode-segmentation-rs-0.2.0/REUSE.toml000066400000000000000000000012661511324612000200340ustar00rootroot00000000000000
SPDX-PackageDownloadLocation = "https://weblate.org/"
SPDX-PackageName = "unicode-segmentation-rs"
SPDX-PackageSupplier = "Michal Čihař "
version = 1

[[annotations]]
SPDX-FileCopyrightText = "Michal Čihař "
SPDX-License-Identifier = "MIT"
path = [".coveragerc", ".dockerignore", ".gitignore", ".isort.cfg", ".pre-commit-config.yaml", "MANIFEST.in", "codecov.yml", "requirements**.txt", "setup.cfg", "pyproject.toml", ".editorconfig", ".well-known/**", "Cargo.toml"]
precedence = "aggregate"

[[annotations]]
SPDX-FileCopyrightText = "Michal Čihař "
SPDX-License-Identifier = "MIT"
path = ["README.md"]
precedence = "aggregate"

unicode-segmentation-rs-0.2.0/pyproject.toml000066400000000000000000000025011511324612000211610ustar00rootroot00000000000000
[build-system]
build-backend = "maturin"
requires = ["maturin>=1.10.0,<2.0"]

[dependency-groups]
build = [
  "maturin==1.10.2"
]
dev = [
  {include-group = "build"},
  {include-group = "test"},
  {include-group = "pre-commit"}
]
pre-commit = [
  "pre-commit==4.5.0"
]
test = [
  "pytest-github-actions-annotate-failures==0.3.0",
  "pytest==9.0.1"
]

[project]
classifiers = [
  "Programming Language :: Rust",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
  "Topic :: Software Development :: Libraries :: Python Modules"
]
description = "Unicode segmentation and width for Python using Rust"
dynamic = ["version"]
license = "MIT"
license-files = ["LICENSE"]
name = "unicode-segmentation-rs"
readme = "README.md"
requires-python = ">=3.10"

[[project.authors]]
email = "michal@weblate.org"
name = "Michal Čihař"

[project.urls]
Documentation = "https://docs.weblate.org/"
Download = "https://github.com/WeblateOrg/unicode-segmentation-rs"
Funding = "https://weblate.org/donate/"
Homepage = "https://weblate.org/"
"Issue Tracker" = "https://github.com/WeblateOrg/unicode-segmentation-rs/issues"
"Source Code" = "https://github.com/WeblateOrg/unicode-segmentation-rs"
Twitter = "https://twitter.com/WeblateOrg"

[tool.maturin]
features = ["pyo3/extension-module"]

unicode-segmentation-rs-0.2.0/src/000077500000000000000000000000001511324612000170365ustar00rootroot00000000000000
unicode-segmentation-rs-0.2.0/src/lib.rs000066400000000000000000000170521511324612000201570ustar00rootroot00000000000000
// Copyright © Michal Čihař 
//
// SPDX-License-Identifier: MIT

#[pyo3::pymodule(gil_used = false)]
mod unicode_segmentation_rs {
    use pyo3::prelude::*;
    use unicode_segmentation::UnicodeSegmentation;
    use unicode_width::UnicodeWidthStr;

    /// Split a string into grapheme clusters.
#[pyfunction] fn graphemes(text: &str, is_extended: bool) -> PyResult> { Ok(text.graphemes(is_extended).map(|s| s.to_string()).collect()) } /// Split a string into grapheme cluster indices #[pyfunction] fn grapheme_indices(text: &str, is_extended: bool) -> PyResult> { Ok(text .grapheme_indices(is_extended) .map(|(i, s)| (i, s.to_string())) .collect()) } /// Split a string at word boundaries (includes punctuation and whitespace). #[pyfunction] fn split_word_bounds(text: &str) -> PyResult> { Ok(text.split_word_bounds().map(|s| s.to_string()).collect()) } /// Split a string at word boundaries with indices. #[pyfunction] fn split_word_bound_indices(text: &str) -> PyResult> { Ok(text .split_word_bound_indices() .map(|(i, s)| (i, s.to_string())) .collect()) } /// Get Unicode words from a string (excludes punctuation and whitespace). #[pyfunction] fn unicode_words(text: &str) -> PyResult> { Ok(text.unicode_words().map(|s| s.to_string()).collect()) } /// Split a string at word boundaries (includes punctuation and whitespace). #[pyfunction] fn unicode_sentences(text: &str) -> PyResult> { Ok(text.unicode_sentences().map(|s| s.to_string()).collect()) } /// Get the display width of a string (as it would appear in a terminal) #[pyfunction] fn text_width(text: &str) -> PyResult { Ok(UnicodeWidthStr::width(text)) } /// Wrap text for gettext PO files /// /// This implementation follows gettext's wrapping behavior: /// - Never breaks escape sequences (\\n, \\", etc.) /// - Prefers breaking after spaces /// - Handles CJK characters with proper width calculation /// - Breaks long words only when necessary #[pyfunction] fn gettext_wrap(text: &str, width: usize) -> PyResult> { if text.is_empty() || width == 0 { return if text.is_empty() { Ok(vec![]) } else { Ok(vec![text.to_string()]) }; } // Split text into chunks at word boundaries let chunks = split_po_chunks(text); // Wrap chunks into lines Ok(wrap_po_chunks(&chunks, width)) } /// Split text into chunks using word boundaries with PO-specific rules fn split_po_chunks(text: &str) -> Vec { let mut chunks: Vec = Vec::new(); let mut last_char: Option = None; let mut second_last_char: Option = None; let mut second_fallback: Option; let mut last_chunk = String::new(); for chunk in text.split_word_bounds() { let mut chunk_str = chunk.to_string(); // Detect escape sequences and emit them if last_char.is_some() && last_char.unwrap() == '\\' && chunk_str.len() > 1 { last_chunk.push(chunk_str.remove(0)); chunks.push(last_chunk.clone()); last_chunk.clear(); } let should_merge = last_char.is_some() && (second_last_char.is_none() || !matches!(last_char.unwrap(), '\\' | 'n') || second_last_char.unwrap() != '\\') && (is_mergeable(&chunk_str) || (!is_open_parenthesis(&chunk_str.chars().next().unwrap()) && !is_line_break(&last_char.unwrap()) && (is_punctuation(&last_char.unwrap()) || (is_punctuation(&chunk_str.chars().next().unwrap()) && !last_char.unwrap().is_whitespace())))); if !should_merge { if !last_chunk.is_empty() { chunks.push(last_chunk.clone()) } last_chunk.clear(); second_fallback = None; } else { second_fallback = Some(last_char.unwrap()); } last_chunk.push_str(chunk_str.as_str()); // Update last_char and second_last_char let chars: Vec = chunk_str.chars().collect(); if chars.len() >= 2 { let len = chars.len(); last_char = Some(chars[len - 2]); second_last_char = Some(chars[len - 1]); } else { second_last_char = second_fallback; last_char = Some(chars[0]); } } if !last_chunk.is_empty() { chunks.push(last_chunk.clone()) } chunks } /// Wrap chunks into lines 
respecting the width limit fn wrap_po_chunks(chunks: &Vec, width: usize) -> Vec { let mut lines = Vec::new(); let mut current_line = String::new(); let mut current_width = 0; for chunk in chunks { let chunk_width: usize = chunk.width(); if current_width + chunk_width <= width { current_line.push_str(chunk.as_str()); current_width += chunk_width; } else { if !current_line.is_empty() { lines.push(current_line.clone()); current_line.clear(); current_width = 0; } current_line.push_str(chunk.as_str()); current_width += chunk_width; } // Force break on \n if chunk.ends_with("\\n") { lines.push(current_line.clone()); current_line.clear(); current_width = 0; } } if !current_line.is_empty() { lines.push(current_line.clone()); } lines } /// Check if a string contains only mergeable characters #[inline] fn is_mergeable(s: &str) -> bool { s.len() == 1 && matches!( &s.chars().next().unwrap(), '/' | '}' | ')' | '>' | '-' | ' ' | '\t' ) } /// Check if a string starts with an open parenthesis character #[inline] fn is_open_parenthesis(c: &char) -> bool { matches!(c, '{' | '(') } /// Check if a string should trigger line break #[inline] fn is_line_break(c: &char) -> bool { matches!(c, '/' | '}' | ')' | '>' | '-') } /// Check if a string contains punctuation characters #[inline] fn is_punctuation(c: &char) -> bool { matches!( c, '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | '\\' | ']' | '^' | '_' | '`' | '{' | '|' | '}' | '~' ) } } unicode-segmentation-rs-0.2.0/tests/000077500000000000000000000000001511324612000174115ustar00rootroot00000000000000unicode-segmentation-rs-0.2.0/tests/test_gettext_wrap.py000066400000000000000000000317521511324612000235470ustar00rootroot00000000000000# Copyright © Michal Čihař # # SPDX-License-Identifier: MIT """Unit tests for gettext wrapping""" import unicode_segmentation_rs class TestGettextWrap: """Tests for gettext PO file wrapping""" def test_simple_wrap(self): text = "This is a simple test string" result = unicode_segmentation_rs.gettext_wrap(text, 20) assert result == ["This is a simple ", "test string"] def test_wrap_with_cjk(self): text = "Hello 世界 this is a test" result = unicode_segmentation_rs.gettext_wrap(text, 10) assert result == ["Hello 世", "界 this ", "is a test"] def test_wrap_short_text(self): text = "Short" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == ["Short"] def test_wrap_empty_string(self): result = unicode_segmentation_rs.gettext_wrap("", 77) assert result == [] def test_wrap_zero_width(self): text = "Test" result = unicode_segmentation_rs.gettext_wrap(text, 0) assert result == ["Test"] def test_wrap_with_punctuation(self): text = "Hello, world! How are you?" result = unicode_segmentation_rs.gettext_wrap(text, 15) assert result == ["Hello, world! 
", "How are you?"] def test_wrap_with_escape_sequences(self): # Escape sequences should not be broken text = "This has \\n escape sequences \\t in it" result = unicode_segmentation_rs.gettext_wrap(text, 11) assert result == ["This has \\n", " escape ", "sequences ", "\\t in it"] def test_wrap_long_word(self): # Long words that don't fit should still be included text = "Supercalifragilisticexpialidocious" result = unicode_segmentation_rs.gettext_wrap(text, 20) assert result == [text] def test_wrap_default_width(self): # Test with typical PO file width (77 characters) text = "This is a longer sentence that should wrap appropriately at the standard gettext width of seventy-seven characters" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "This is a longer sentence that should wrap appropriately at the standard ", "gettext width of seventy-seven characters", ] def test_wrapping_spaces(self): """This tests that we wrap like gettext.""" text = r"bla\t12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [text] def test_wrapping_long_fit(self): text = r"bla\t12345 12345 12345 12345 12345 12 12345 12345 12345 12345 12345 12345 123" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [text] def test_wrapping_long_overflow(self): text = r"bla\t12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 1" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ r"bla\t12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 ", "1", ] def test_wrapping_long_multiline_1(self): text = "bla\t12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 1234\n1234" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "bla\t12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 1234\n", "1234", ] def test_wrapping_long_multiline_2(self): text = r"bla\t12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345\n12345" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ r"bla\t12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 ", r"12345\n", "12345", ] def test_wrapping_long_escapes(self): text = r"\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ r"\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\", r"\\", ] def test_wrapping_cjk(self): text = "効率的なバグの報告はPostGISの開発を助ける本質的な方法です。最も効率的なバグ報告は、PostGIS開発者がそれを再現できるようにすることで、それの引き金となったス" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "効率的なバグの報告はPostGISの開発を助ける本質的な方法です。最も効率的なバグ報", "告は、PostGIS開発者がそれを再現できるようにすることで、それの引き金となったス", ] def test_wrap_emoji(self): text = 'print(ts.string_get_word_breaks("Test ❤️‍🔥 Test")) # Prints [1, 2, 3, 4, 5, ' result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [text] def test_wrap_parenthesis_1(self): text = r"Konvertiert [param what] in [param type] auf die bestmögliche Weise. Der [param type] verwendet die [enum Variant.Type]-Werte.\n" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "Konvertiert [param what] in [param type] auf die bestmögliche Weise. Der ", r"[param type] verwendet die [enum Variant.Type]-Werte.\n", ] def test_wrap_parenthesis_2(self): text = r"- Eine von [Object] abgeleitete Klasse, die in [ClassDB] existiert, z. B. 
[Node].\n" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "- Eine von [Object] abgeleitete Klasse, die in [ClassDB] existiert, z. B. ", r"[Node].\n", ] def test_wrap_escape_line(self): text = r"%{src?%{dest?转发:入站}:出站} %{ipv6?%{ipv4?IPv4 and IPv6:IPv6}:IPv4}%{proto?, 协议 %{proto#%{next?, }%{item.types?%{item.name}具有类型 %{item.types#%{next?, }%{item}} 的 ICMP:%{item.name}}}}%{mark?, 标记 %{mark.val}}%{dscp?, DSCP %{dscp.inv?%{dscp.val}:%{dscp.val}}}%{helper?, 助手 %{helper.inv?%{helper.val}:%{helper.val}}}" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "%{src?%{dest?转发:入站}:出站} %{ipv6?%{ipv4?IPv4 and IPv6:IPv6}:IPv4}%{proto?, 协议 %{proto#%{next?, }%", r"{item.types?%{item.name}具有类型 %{item.types#%{next?, }%{item}} 的 ", "ICMP:%{item.name}}}}%{mark?, 标记 %{mark.val}}%{dscp?, DSCP %{dscp.inv?%{dscp.val}:%{dscp.val}}}%{helper?, 助手 %{helper.inv?%{helper.val}", r":%{helper.val}}}", ] def test_wrap_parenthesis_long(self): text = r"Must be required by a NotificationListenerService, to ensure that only the system can bind to it. See [url=https://developer.android.com/reference/android/Manifest.permission#BIND_NOTIFICATION_LISTENER_SERVICE]BIND_NOTIFICATION_LISTENER_SERVICE[/url]." result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "Must be required by a NotificationListenerService, to ensure that only the ", "system can bind to it. See [url=https://developer.android.com/reference/", "android/", "Manifest.permission#BIND_NOTIFICATION_LISTENER_SERVICE]BIND_NOTIFICATION_LISTENER_SERVICE[/", "url].", ] def test_wrap_plural_form(self): text = r"Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;\n" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && ", r"n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;\n", ] def test_wrap_url(self): text = r"Language-Team: Ukrainian \n" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ "Language-Team: Ukrainian \n", ] def test_wrap_escape(self): text = r"x: to be continued with \"do not loop\", \"loop in current folder\", and \"loop in all folders\".\nWhen trying to find unread messages:" result = unicode_segmentation_rs.gettext_wrap(text, 77) assert result == [ r"x: to be continued with \"do not loop\", \"loop in current folder\", and ", r"\"loop in all folders\".\n", "When trying to find unread messages:", ] def test_wrap_label(self): text = r"You can get a copy of your Recovery Key by going to &syncBrand.shortName.label; Options on your other device, and selecting \"My Recovery Key\" under \"Manage Account\"." 
        result = unicode_segmentation_rs.gettext_wrap(text, 77)
        assert result == [
            "You can get a copy of your Recovery Key by going to ",
            "&syncBrand.shortName.label; Options on your other device, and selecting ",
            r"\"My Recovery Key\" under \"Manage Account\".",
        ]

    def test_wrap_wide_stop(self):
        text = "在 Mastodon 上关注 [@beeware@fosstodon.org](https://fosstodon.org/@beeware),或[加入 BeeWare 爱好者邮件列表](/zh_CN/community/keep-informed/)以获取与项目相关的更新、提示、技巧和公告。"
        result = unicode_segmentation_rs.gettext_wrap(text, 77)
        assert result == [
            "在 Mastodon 上关注 [@beeware@fosstodon.org](https://fosstodon.org/@beeware),",
            "或[加入 BeeWare 爱好者邮件列表](/zh_CN/community/keep-informed/)以获取与项目",
            "相关的更新、提示、技巧和公告。",
        ]

    def test_wrap_escape_en(self):
        text = r"By default, \":doc:`Wire Transfer `\" is the only payment provider activated, but you still have to fill out the payment details."
        result = unicode_segmentation_rs.gettext_wrap(text, 77)
        assert result == [
            r"By default, \":doc:`Wire Transfer `\" is the only payment provider activated, but you still have ",
            "to fill out the payment details.",
        ]

    def test_wrap_escape_localized(self):
        text = r"기본값으로 \":doc:`온라인 이체 `\"만 결제대행업체을 사용하도록 설정되어 있으나, 여기에도 결제 세부 정보를 입력해야 합니다."
        result = unicode_segmentation_rs.gettext_wrap(text, 77)
        assert result == [
            r"기본값으로 \":doc:`온라인 이체 `\"만 결제대행업체을 사용하도록 설정되어 있으나, 여기에도 결제 ",
            r"세부 정보를 입력해야 합니다.",
        ]

    def test_wrap_limit(self):
        text = r"Ukuba uyayiqonda into eyenzekayo, \nungaxelela i-&brandShortName; ukuba iqalise ukuthemba ufaniso lwale sayithi. \nNokuba uyayithemba isayithi, le mposiso isenokuthetha ukuba kukho umntu \nobhucabhuca ukudibanisa kwakho."
        result = unicode_segmentation_rs.gettext_wrap(text, 77)
        assert result == [
            "Ukuba uyayiqonda into eyenzekayo, \\n",
            "ungaxelela i-&brandShortName; ukuba iqalise ukuthemba ufaniso lwale sayithi. ",
            "\\n",
            "Nokuba uyayithemba isayithi, le mposiso isenokuthetha ukuba kukho umntu ",
            "\\n",
            "obhucabhuca ukudibanisa kwakho.",
        ]
unicode-segmentation-rs-0.2.0/tests/test_unicode_segmentation.py000066400000000000000000000161171511324612000252330ustar00rootroot00000000000000
# Copyright © Michal Čihař
#
# SPDX-License-Identifier: MIT

"""Unit tests for unicode-segmentation-rs"""

import unicode_segmentation_rs


class TestGraphemes:
    """Tests for grapheme cluster segmentation"""

    def test_simple_ascii(self):
        text = "Hello"
        result = unicode_segmentation_rs.graphemes(text, is_extended=True)
        assert result == ["H", "e", "l", "l", "o"]

    def test_emoji_zwj_sequence(self):
        # Family emoji with ZWJ (Zero Width Joiner)
        text = "👨‍👩‍👧‍👦"
        result = unicode_segmentation_rs.graphemes(text, is_extended=True)
        assert result == [text]

    def test_combining_characters(self):
        # Devanagari with combining characters
        text = "नमस्ते"
        result = unicode_segmentation_rs.graphemes(text, is_extended=True)
        # Should treat combining characters as single graphemes
        assert len(result) < len(text)

    def test_grapheme_indices(self):
        text = "Hello"
        result = unicode_segmentation_rs.grapheme_indices(text, is_extended=True)
        expected = [(0, "H"), (1, "e"), (2, "l"), (3, "l"), (4, "o")]
        assert result == expected

    def test_empty_string(self):
        result = unicode_segmentation_rs.graphemes("", is_extended=True)
        assert result == []


class TestWordSegmentation:
    """Tests for word segmentation"""

    def test_split_word_bounds_simple(self):
        text = "Hello world"
        result = unicode_segmentation_rs.split_word_bounds(text)
        assert result == ["Hello", " ", "world"]

    def test_split_word_bounds_punctuation(self):
        text = "Hello, world!"
        result = unicode_segmentation_rs.split_word_bounds(text)
        assert result == ["Hello", ",", " ", "world", "!"]

    def test_unicode_words(self):
        text = "Hello, world!"
        result = unicode_segmentation_rs.unicode_words(text)
        assert result == ["Hello", "world"]

    def test_split_word_bound_indices(self):
        text = "Hello world"
        result = unicode_segmentation_rs.split_word_bound_indices(text)
        expected = [(0, "Hello"), (5, " "), (6, "world")]
        assert result == expected

    def test_empty_string(self):
        result = unicode_segmentation_rs.unicode_words("")
        assert result == []

    def test_multilingual(self):
        text = "Hello世界"
        result = unicode_segmentation_rs.unicode_words(text)
        # Should handle mixed scripts
        assert len(result) > 0


class TestSentenceSegmentation:
    """Tests for sentence segmentation"""

    def test_simple_sentences(self):
        text = "Hello world. How are you?"
        result = unicode_segmentation_rs.unicode_sentences(text)
        assert len(result) == 2
        assert result[0] == "Hello world. "
        assert result[1] == "How are you?"

    def test_multiple_sentences(self):
        text = "First. Second! Third?"
        result = unicode_segmentation_rs.unicode_sentences(text)
        assert len(result) == 3

    def test_abbreviations(self):
        text = "Dr. Smith went home."
        result = unicode_segmentation_rs.unicode_sentences(text)
        # Should handle abbreviations correctly
        assert len(result) >= 1

    def test_empty_string(self):
        result = unicode_segmentation_rs.unicode_sentences("")
        assert result == []

    def test_arabic(self):
        text = "مرحبا بك. كيف حالك؟"
        result = unicode_segmentation_rs.unicode_sentences(text)
        assert len(result) == 2

    def test_japanese(self):
        text = "こんにちは。お元気ですか?"
        result = unicode_segmentation_rs.unicode_sentences(text)
        assert len(result) == 2


class TestDisplayWidth:
    """Tests for display width calculation"""

    def test_ascii_width(self):
        assert unicode_segmentation_rs.text_width("Hello") == 5
        assert unicode_segmentation_rs.text_width("a") == 1

    def test_cjk_width(self):
        # Chinese characters are typically 2 columns wide
        assert unicode_segmentation_rs.text_width("世界") == 4
        assert unicode_segmentation_rs.text_width("世") == 2

    def test_mixed_width(self):
        text = "Hello 世界"
        width = unicode_segmentation_rs.text_width(text)
        # "Hello" = 5, space = 1, "世界" = 4
        assert width == 10

    def test_empty_string(self):
        assert unicode_segmentation_rs.text_width("") == 0

    def test_text_width_ascii(self):
        assert unicode_segmentation_rs.text_width("a") == 1
        assert unicode_segmentation_rs.text_width("A") == 1
        assert unicode_segmentation_rs.text_width("1") == 1
        assert unicode_segmentation_rs.text_width(" ") == 1

    def test_text_width(self):
        assert unicode_segmentation_rs.text_width("世") == 2
        assert unicode_segmentation_rs.text_width("界") == 2
        assert unicode_segmentation_rs.text_width("あ") == 2

    def test_text_width_control(self):
        # Control characters are counted with a width of 1
        assert unicode_segmentation_rs.text_width("\t") == 1
        assert unicode_segmentation_rs.text_width("\n") == 1
        assert unicode_segmentation_rs.text_width("\r") == 1

    def test_text_width_mode(self):
        # Basic test that CJK mode works
        assert unicode_segmentation_rs.text_width("a") == 1
        assert unicode_segmentation_rs.text_width("世") == 2
        assert unicode_segmentation_rs.text_width("\t") == 1


class TestEdgeCases:
    """Tests for edge cases and special scenarios"""

    def test_only_whitespace(self):
        text = "   "
        assert unicode_segmentation_rs.graphemes(text, is_extended=True) == [
            " ",
            " ",
            " ",
        ]
        assert unicode_segmentation_rs.split_word_bounds(text) == ["   "]
        assert unicode_segmentation_rs.unicode_words(text) == []

    def test_only_punctuation(self):
        text = "!!!"
        assert unicode_segmentation_rs.unicode_words(text) == []
        assert unicode_segmentation_rs.split_word_bounds(text) == ["!", "!", "!"]

    def test_newlines(self):
        text = "Hello\nWorld"
        result = unicode_segmentation_rs.unicode_words(text)
        assert "Hello" in result
        assert "World" in result

    def test_tabs(self):
        text = "Hello\tWorld"
        result = unicode_segmentation_rs.unicode_words(text)
        assert "Hello" in result
        assert "World" in result

    def test_multiple_spaces(self):
        text = "Hello   World"
        words = unicode_segmentation_rs.unicode_words(text)
        assert words == ["Hello", "World"]


class TestPerformance:
    """Basic performance sanity checks"""

    def test_large_text_graphemes(self):
        text = "a" * 10000
        result = unicode_segmentation_rs.graphemes(text, is_extended=True)
        assert len(result) == 10000

    def test_large_text_words(self):
        text = " ".join(["word"] * 1000)
        result = unicode_segmentation_rs.unicode_words(text)
        assert len(result) == 1000

    def test_large_text_width(self):
        text = "a" * 10000
        width = unicode_segmentation_rs.text_width(text)
        assert width == 10000
unicode-segmentation-rs-0.2.0/unicode_segmentation_rs.pyi000066400000000000000000000012251511324612000237010ustar00rootroot00000000000000
# Copyright © Michal Čihař
#
# SPDX-License-Identifier: MIT

"""
Type stubs for unicode-segmentation-rs

This module provides Unicode text segmentation and width calculation.
"""

def graphemes(text: str, is_extended: bool) -> list[str]: ...
def grapheme_indices(text: str, is_extended: bool) -> list[tuple[int, str]]: ...
def split_word_bounds(text: str) -> list[str]: ...
def split_word_bound_indices(text: str) -> list[tuple[int, str]]: ...
def unicode_words(text: str) -> list[str]: ...
def unicode_sentences(text: str) -> list[str]: ...
def text_width(text: str) -> int: ...
def gettext_wrap(text: str, width: int) -> list[str]: ...
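For orientation, a minimal usage sketch of the API declared in the stub above. This is illustrative only: it assumes the compiled unicode_segmentation_rs extension module is built and importable, and the expected values simply mirror assertions from the test suite.

# Illustrative usage sketch; not part of the packaged sources.
import unicode_segmentation_rs

# Grapheme segmentation keeps ZWJ emoji sequences together as one cluster.
assert unicode_segmentation_rs.graphemes("👨‍👩‍👧‍👦", is_extended=True) == ["👨‍👩‍👧‍👦"]

# Word segmentation drops punctuation and whitespace.
assert unicode_segmentation_rs.unicode_words("Hello, world!") == ["Hello", "world"]

# Sentence segmentation keeps the trailing space with the preceding sentence.
assert unicode_segmentation_rs.unicode_sentences("Hello world. How are you?") == [
    "Hello world. ",
    "How are you?",
]

# Display width counts CJK characters as two columns.
assert unicode_segmentation_rs.text_width("Hello 世界") == 10

# gettext-style wrapping returns a string unchanged when it fits the usual PO width.
assert unicode_segmentation_rs.gettext_wrap("Hello world.", 77) == ["Hello world."]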