textdistance-4.2.2/.drone.star

def main(ctx):
    return dict(
        kind="pipeline",
        type="docker",
        name="default",
        trigger=dict(branch="master"),
        steps=[
            dict(
                name="install task",
                image="alpine:latest",
                commands=[
                    "apk add --no-cache wget",
                    "wget https://taskfile.dev/install.sh",
                    "sh install.sh -- latest",
                    "rm install.sh",
                ],
            ),
            step(env="pytest-pure", python="3.6"),
            step(env="pytest-pure", python="3.7"),
            step(env="pytest-pure", python="3.8"),
            step(env="pytest-pure", python="3.9"),
            step(env="pytest-external", python="3.6"),
            step(env="pytest-external", python="3.7"),
            step(env="pytest-external", python="3.8"),
            step(env="pytest-external", python="3.9"),
            step(env="flake8", python="3.7"),
        ],
    )

def step(env, python):
    result = dict(
        name="{} (py{})".format(env, python),
        image="python:{}-alpine".format(python),
        depends_on=["install task"],
        environment=dict(
            # set coverage database file name to avoid conflicts between steps
            COVERAGE_FILE=".coverage.{}.{}".format(env, python),
        ),
        commands=[
            "apk add curl git gcc libc-dev",
            "./bin/task PYTHON_BIN=python3 VENVS=/opt/py{python}/ -f {env}:run".format(
                python=python,
                env=env,
            ),
        ],
    )
    return result
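# For reference: each step(...) call above expands an env/python pair into a
# full Drone step. Evaluating the format strings by hand,
# step(env="flake8", python="3.7") yields roughly:
#
#     dict(
#         name="flake8 (py3.7)",
#         image="python:3.7-alpine",
#         depends_on=["install task"],
#         environment=dict(COVERAGE_FILE=".coverage.flake8.3.7"),
#         commands=[
#             "apk add curl git gcc libc-dev",
#             "./bin/task PYTHON_BIN=python3 VENVS=/opt/py3.7/ -f flake8:run",
#         ],
#     )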
textdistance-4.2.2/.gitignore

*.egg-info
__pycache__
*.pyc
build/
dist/
.tox/
licenses_example/choosealicense.com
.hypothesis/
.coverage
htmlcov
.task/
venvs/

textdistance-4.2.2/LICENSE

Copyright 2018 @orsinium

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

textdistance-4.2.2/MANIFEST.in

include README.md
include LICENSE

textdistance-4.2.2/README.md

# TextDistance

![TextDistance logo](logo.png)

[![Build Status](https://travis-ci.org/life4/textdistance.svg?branch=master)](https://travis-ci.org/life4/textdistance) [![PyPI version](https://img.shields.io/pypi/v/textdistance.svg)](https://pypi.python.org/pypi/textdistance) [![Status](https://img.shields.io/pypi/status/textdistance.svg)](https://pypi.python.org/pypi/textdistance) [![License](https://img.shields.io/pypi/l/textdistance.svg)](LICENSE)

**TextDistance** -- a Python library for comparing the distance between two or more sequences with many algorithms.

Features:

- 30+ algorithms
- Pure python implementation
- Simple usage
- Comparison of more than two sequences
- Some algorithms have more than one implementation in one class
- Optional numpy usage for maximum speed

## Algorithms

### Edit based

| Algorithm | Class | Functions |
|-----------|-------|-----------|
| [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) | `Hamming` | `hamming` |
| [MLIPNS](http://www.sial.iias.spb.su/files/386-386-1-PB.pdf) | `Mlipns` | `mlipns` |
| [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) | `Levenshtein` | `levenshtein` |
| [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) | `DamerauLevenshtein` | `damerau_levenshtein` |
| [Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) | `JaroWinkler` | `jaro_winkler`, `jaro` |
| [Strcmp95](http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c) | `StrCmp95` | `strcmp95` |
| [Needleman-Wunsch](https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm) | `NeedlemanWunsch` | `needleman_wunsch` |
| [Gotoh](http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/LOA/Lec6-Sequence-Alignment-Affine-Gaps-Gotoh1982.pdf) | `Gotoh` | `gotoh` |
| [Smith-Waterman](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) | `SmithWaterman` | `smith_waterman` |

### Token based

| Algorithm | Class | Functions |
|-----------|-------|-----------|
| [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) | `Jaccard` | `jaccard` |
| [Sørensen–Dice coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) | `Sorensen` | `sorensen`, `sorensen_dice`, `dice` |
| [Tversky index](https://en.wikipedia.org/wiki/Tversky_index) | `Tversky` | `tversky` |
| [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient) | `Overlap` | `overlap` |
| [Tanimoto distance](https://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_similarity_and_distance) | `Tanimoto` | `tanimoto` |
| [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) | `Cosine` | `cosine` |
| [Monge-Elkan](https://www.academia.edu/200314/Generalized_Monge-Elkan_Method_for_Approximate_Text_String_Comparison) | `MongeElkan` | `monge_elkan` |
| [Bag distance](https://github.com/Yomguithereal/talisman/blob/master/src/metrics/bag.js) | `Bag` | `bag` |
### Sequence based

| Algorithm | Class | Functions |
|-----------|-------|-----------|
| [longest common subsequence similarity](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) | `LCSSeq` | `lcsseq` |
| [longest common substring similarity](https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher) | `LCSStr` | `lcsstr` |
| [Ratcliff-Obershelp similarity](https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching) | `RatcliffObershelp` | `ratcliff_obershelp` |

### Compression based

[Normalized compression distance](https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance) with different compression algorithms.
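For reference, the classic definition from the linked article, with `C(x)` the compressed size of `x` and `xy` the concatenation of both inputs, is:

```
NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
```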
Classic compression algorithms:

| Algorithm | Class | Function |
|-----------|-------|----------|
| [Arithmetic coding](https://en.wikipedia.org/wiki/Arithmetic_coding) | `ArithNCD` | `arith_ncd` |
| [RLE](https://en.wikipedia.org/wiki/Run-length_encoding) | `RLENCD` | `rle_ncd` |
| [BWT RLE](https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform) | `BWTRLENCD` | `bwtrle_ncd` |

Normal compression algorithms:

| Algorithm | Class | Function |
|-----------|-------|----------|
| Square Root | `SqrtNCD` | `sqrt_ncd` |
| [Entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) | `EntropyNCD` | `entropy_ncd` |

Work in progress algorithms that compare two strings as arrays of bits:

| Algorithm | Class | Function |
|-----------|-------|----------|
| [BZ2](https://en.wikipedia.org/wiki/Bzip2) | `BZ2NCD` | `bz2_ncd` |
| [LZMA](https://en.wikipedia.org/wiki/LZMA) | `LZMANCD` | `lzma_ncd` |
| [ZLib](https://en.wikipedia.org/wiki/Zlib) | `ZLIBNCD` | `zlib_ncd` |

See the [blog post](https://articles.life4web.ru/other/ncd/) for more details about NCD.

### Phonetic

| Algorithm | Class | Functions |
|-----------|-------|-----------|
| [MRA](https://en.wikipedia.org/wiki/Match_rating_approach) | `MRA` | `mra` |
| [Editex](https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html) | `Editex` | `editex` |

### Simple

| Algorithm | Class | Functions |
|-----------|-------|-----------|
| Prefix similarity | `Prefix` | `prefix` |
| Postfix similarity | `Postfix` | `postfix` |
| Length distance | `Length` | `length` |
| Identity similarity | `Identity` | `identity` |
| Matrix similarity | `Matrix` | `matrix` |

## Installation

### Stable

Only the pure python implementation:

```bash
pip install textdistance
```

With extra libraries for maximum speed:

```bash
pip install "textdistance[extras]"
```

With all libraries (required for [benchmarking](#benchmarks) and [testing](#running-tests)):

```bash
pip install "textdistance[benchmark]"
```

With algorithm-specific extras:

```bash
pip install "textdistance[Hamming]"
```

Algorithms with available extras: `DamerauLevenshtein`, `Hamming`, `Jaro`, `JaroWinkler`, `Levenshtein`.

### Dev

Via pip:

```bash
pip install -e git+https://github.com/life4/textdistance.git#egg=textdistance
```

Or clone the repo and install with some extras:

```bash
git clone https://github.com/life4/textdistance.git
pip install -e ".[benchmark]"
```

## Usage

All algorithms have 2 interfaces:

1. A class with algorithm-specific params for customizing.
2. A class instance with default params for quick and simple usage.

All algorithms have some common methods:

1. `.distance(*sequences)` -- calculate the distance between sequences.
2. `.similarity(*sequences)` -- calculate the similarity of sequences.
3. `.maximum(*sequences)` -- the maximum possible value for distance and similarity. For any sequences: `distance + similarity == maximum`.
4. `.normalized_distance(*sequences)` -- normalized distance between sequences. The return value is a float between 0 and 1, where 0 means the sequences are equal and 1 means they are totally different.
5. `.normalized_similarity(*sequences)` -- normalized similarity of sequences. The return value is a float between 0 and 1, where 0 means totally different and 1 means equal.

Most common init arguments:

1. `qval` -- q-value for splitting sequences into q-grams. Possible values:
    - 1 (default) -- compare sequences by chars.
    - 2 or more -- transform sequences into q-grams.
    - None -- split sequences by words.
2. `as_set` -- for token-based algorithms:
    - True -- `t` and `ttt` are equal.
    - False (default) -- `t` and `ttt` are different.
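To illustrate these arguments, here is a short sketch with `Jaccard` (values worked out by hand from the definitions above, so treat the exact numbers as indicative):

```python
import textdistance

# default qval=1: 'test' and 'text' share 3 of 5 chars (counted with multiplicity)
textdistance.Jaccard()('test', 'text')  # 0.6

# qval=2: compare by bigrams; only 'te' is shared between
# {'te', 'es', 'st'} and {'te', 'ex', 'xt'}
textdistance.Jaccard(qval=2)('test', 'text')  # 0.2

# qval=None: split by words, so word order does not matter
textdistance.Jaccard(qval=None)('spam and eggs', 'eggs and spam')  # 1.0

# as_set=True: repeated tokens are collapsed
textdistance.Jaccard(as_set=True)('t', 'ttt')  # 1.0
```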
## Examples

For example, [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance):

```python
import textdistance

textdistance.hamming('test', 'text')
# 1

textdistance.hamming.distance('test', 'text')
# 1

textdistance.hamming.similarity('test', 'text')
# 3

textdistance.hamming.normalized_distance('test', 'text')
# 0.25

textdistance.hamming.normalized_similarity('test', 'text')
# 0.75

textdistance.Hamming(qval=2).distance('test', 'text')
# 2
```

All other algorithms have the same interface.

## Articles

A few articles with examples of how to use textdistance in the real world:

- [Guide to Fuzzy Matching with Python](http://theautomatic.net/2019/11/13/guide-to-fuzzy-matching-with-python/)
- [String similarity — the basic know your algorithms guide!](https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227)
- [Normalized compression distance](https://articles.life4web.ru/other/ncd/)

## Extra libraries

For the main algorithms, textdistance tries to call known external libraries (fastest first) when they are available (installed on your system) and applicable (the implementation can compare the given type of sequences). [Install](#installation) textdistance with extras for this feature. You can disable it by passing the `external=False` argument on init:

```python3
import textdistance

hamming = textdistance.Hamming(external=False)
hamming('text', 'testit')
# 3
```

Supported libraries:

1. [abydos](https://github.com/chrislit/abydos)
1. [Distance](https://github.com/doukremt/distance)
1. [jellyfish](https://github.com/jamesturk/jellyfish)
1. [py_stringmatching](https://github.com/anhaidgroup/py_stringmatching)
1. [pylev](https://github.com/toastdriven/pylev)
1. [python-Levenshtein](https://github.com/ztane/python-Levenshtein)
1. [pyxDamerauLevenshtein](https://github.com/gfairchild/pyxDamerauLevenshtein)

Algorithms with external implementations:

1. DamerauLevenshtein
1. Hamming
1. Jaro
1. JaroWinkler
1. Levenshtein
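To see the speedup on your own machine, a minimal sketch with the standard library `timeit` (assuming at least one of the libraries above, e.g. `python-Levenshtein`, is installed):

```python
import timeit

for external in (False, True):
    total = timeit.timeit(
        stmt="textdistance.Levenshtein(external={})('qwertyuiop', 'asdfgh')".format(external),
        setup='import textdistance',
        number=10000,
    )
    print('external={}: {:.3f}s'.format(external, total))
```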
## Benchmarks

Without extras installed:

| algorithm | library | function | time |
|-----------|---------|----------|------|
| DamerauLevenshtein | jellyfish | damerau_levenshtein_distance | 0.00965294 |
| DamerauLevenshtein | pyxdameraulevenshtein | damerau_levenshtein_distance | 0.151378 |
| DamerauLevenshtein | pylev | damerau_levenshtein | 0.766461 |
| DamerauLevenshtein | **textdistance** | DamerauLevenshtein | 4.13463 |
| DamerauLevenshtein | abydos | damerau_levenshtein | 4.3831 |
| Hamming | Levenshtein | hamming | 0.0014428 |
| Hamming | jellyfish | hamming_distance | 0.00240262 |
| Hamming | distance | hamming | 0.036253 |
| Hamming | abydos | hamming | 0.0383933 |
| Hamming | **textdistance** | Hamming | 0.176781 |
| Jaro | Levenshtein | jaro | 0.00313561 |
| Jaro | jellyfish | jaro_distance | 0.0051885 |
| Jaro | py_stringmatching | jaro | 0.180628 |
| Jaro | **textdistance** | Jaro | 0.278917 |
| JaroWinkler | Levenshtein | jaro_winkler | 0.00319735 |
| JaroWinkler | jellyfish | jaro_winkler | 0.00540443 |
| JaroWinkler | **textdistance** | JaroWinkler | 0.289626 |
| Levenshtein | Levenshtein | distance | 0.00414404 |
| Levenshtein | jellyfish | levenshtein_distance | 0.00601647 |
| Levenshtein | py_stringmatching | levenshtein | 0.252901 |
| Levenshtein | pylev | levenshtein | 0.569182 |
| Levenshtein | distance | levenshtein | 1.15726 |
| Levenshtein | abydos | levenshtein | 3.68451 |
| Levenshtein | **textdistance** | Levenshtein | 8.63674 |

Total: 24 libs. Yeah, so slow. Use TextDistance in production only with extras.

Textdistance uses these benchmark results to optimize the order in which external libs are tried, calling the fastest one first (if possible).

You can run the benchmark manually on your system:

```bash
pip install textdistance[benchmark]
python3 -m textdistance.benchmark
```

TextDistance will show the benchmark results table for your system and save library priorities into the `libraries.json` file in TextDistance's folder. This file is then used by textdistance to call the fastest algorithm implementation first. A default [libraries.json](textdistance/libraries.json) is already included in the package.

## Running tests

All you need is [task](https://taskfile.dev/). See [Taskfile.yml](./Taskfile.yml) for the list of available commands. For example, to run the tests including third-party libraries usage, execute `task pytest-external:run`.

## Contributing

PRs are welcome!

- Found a bug? Fix it!
- Want to add more algorithms? Sure! Just make it with the same interface as other algorithms in the lib and add some tests.
- Can make something faster? Great! Just avoid external dependencies and remember that everything should work not only with strings.
- Something else that you think is good? Do it! Just make sure that CI passes and everything from the README is still applicable (interface, features, and so on).
- Have no time to code? Tell your friends and subscribers about `textdistance`. More users, more contributions, more amazing features.

Thank you :heart:

textdistance-4.2.2/Taskfile.yml

# https://taskfile.dev/
version: "3"

vars:
  PYTHON_BIN: python3.7
  VENVS: ./venvs/
  FLAKE8_ENV: "{{.VENVS}}flake8"
  PYTEST_PURE_ENV: "{{.VENVS}}pytest-pure"
  PYTEST_EXT_ENV: "{{.VENVS}}pytest-ext"
  ISORT_ENV: "{{.VENVS}}isort"
  TWINE_ENV: "{{.VENVS}}twine"
  TESTS_PATH: tests/

tasks:
  venv:create:
    status:
      - "test -f {{.ENV}}/bin/activate"
    cmds:
      - "{{.PYTHON_BIN}} -m venv {{.ENV}}"
      - "{{.ENV}}/bin/python3 -m pip install -U pip setuptools wheel"
  pip:install:
    sources:
      - pyproject.toml
      - "{{.ENV}}/bin/activate"
    deps:
      - task: venv:create
        vars:
          ENV: "{{.ENV}}"
    cmds:
      - "{{.ENV}}/bin/pip install '.[{{.EXTRA}}]'"

  twine:install:
    deps:
      - task: venv:create
        vars:
          ENV: "{{.TWINE_ENV}}"
    cmds:
      - "{{.TWINE_ENV}}/bin/pip install twine"
  twine:build:
    deps:
      - twine:install
    cmds:
      - rm -rf dist/
      - "{{.TWINE_ENV}}/bin/python3 setup.py sdist bdist_wheel"
  twine:release:
    deps:
      - twine:build
    cmds:
      - "{{.TWINE_ENV}}/bin/twine upload dist/textdistance-*"

  flake8:install:
    status:
      - "test -f {{.FLAKE8_ENV}}/bin/flake8"
    deps:
      - task: venv:create
        vars:
          ENV: "{{.FLAKE8_ENV}}"
    cmds:
      - "{{.FLAKE8_ENV}}/bin/python3 -m pip install -r requirements-flake.txt"
  flake8:run:
    sources:
      - "**/*.py"
    deps:
      - flake8:install
    cmds:
      - "{{.FLAKE8_ENV}}/bin/flake8 ."

  pytest-pure:run:
    deps:
      - task: pip:install
        vars:
          ENV: "{{.PYTEST_PURE_ENV}}"
          EXTRA: test
    cmds:
      - "{{.PYTEST_PURE_ENV}}/bin/pytest -m 'not external' {{.ARGS}} {{.TESTS_PATH}}"
  pytest-external:run:
    deps:
      - task: pip:install
        vars:
          ENV: "{{.PYTEST_EXT_ENV}}"
          EXTRA: test,benchmark
    cmds:
      - "{{.PYTEST_EXT_ENV}}/bin/pytest {{.ARGS}} {{.TESTS_PATH}}"

  isort:run:
    sources:
      - "**/*.py"
    deps:
      - task: pip:install
        vars:
          ENV: "{{.ISORT_ENV}}"
          EXTRA: tests
    cmds:
      - "{{.ISORT_ENV}}/bin/isort ."

textdistance-4.2.2/constraints.txt

abydos                  # https://github.com/chrislit/abydos
distance                # https://github.com/doukremt/distance
jellyfish               # https://github.com/jamesturk/jellyfish
numpy
py_stringmatching       # https://github.com/anhaidgroup/py_stringmatching
pylev                   # https://github.com/toastdriven/pylev
python-Levenshtein      # https://github.com/ztane/python-Levenshtein
pyxDamerauLevenshtein   # https://github.com/gfairchild/pyxDamerauLevenshtein
tabulate

textdistance-4.2.2/deploy.sh

set -e
pandoc --from=markdown --to=rst --output=README.rst README.md
python3 setup.py sdist bdist_wheel
twine upload dist/*

textdistance-4.2.2/licenses_example/README.md

# Licenses

Compare the texts of different OSS licenses with `EntropyNCD`. See the [blog post](https://articles.life4web.ru/eng/ncd/#most-similar-licenses) for more information.
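Assuming the [choosealicense.com](https://github.com/github/choosealicense.com) repository has been cloned into this folder (its `_licenses` directory is what `compare.py` below reads), a session might look like this:

```bash
git clone https://github.com/github/choosealicense.com.git
python3 compare.py          # no arguments: list the available license names
python3 compare.py 3 mit    # the 5 licenses most similar to MIT, using 3-grams
```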
textdistance-4.2.2/licenses_example/compare.py

# built-in
from itertools import islice
from pathlib import Path
from sys import argv

# project
from textdistance import EntropyNCD

# read files
licenses = dict()
for path in Path('choosealicense.com', '_licenses').iterdir():
    licenses[path.stem] = path.read_text()

# show licenses list if no arguments passed
if len(argv) == 1:
    print(*sorted(licenses.keys()), sep='\n')
    exit(1)

# compare all with one
qval = int(argv[1]) if argv[1] else None
compare_with = argv[2]
distances = dict()
for name, content in licenses.items():
    distances[name] = EntropyNCD(qval=qval)(
        licenses[compare_with],
        content,
    )

# show 5 most similar
sorted_distances = sorted(distances.items(), key=lambda d: d[1])
for name, distance in islice(sorted_distances, 5):
    print('{:20} {:.4f}'.format(name, distance))

textdistance-4.2.2/licenses_example/make_heatmap.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import islice\n",
    "from pathlib import Path\n",
    "from sys import argv\n",
    "from textdistance import EntropyNCD\n",
    "\n",
    "# read files\n",
    "licenses = dict()\n",
    "for path in Path('choosealicense.com', '_licenses').iterdir():\n",
    "    licenses[path.stem] = path.read_text()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "distances = []\n",
    "for name1, content1 in licenses.items():\n",
    "    for name2, content2 in licenses.items():\n",
    "        distances.append((name1, name2, EntropyNCD(qval=None)(content1, content2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotnine as gg\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(distances, columns=['name1', 'name2', 'distance'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    gg.ggplot(df)\n",
    "    + gg.geom_tile(gg.aes(x='name1', y='name2', fill='distance'))\n",
    "    + gg.scale_fill_continuous(palette=lambda *args: gg.scale_fill_continuous().palette(*args)[::-1])\n",
    "    + gg.theme(\n",
    "        figure_size=(12, 8),\n",
    "        axis_text_x=gg.element_text(angle=90),\n",
    "    )\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

textdistance-4.2.2/licenses_example/requirements.txt

pandas
plotnine
textdistance

textdistance-4.2.2/logo.png
(binary PNG image data omitted: the TextDistance logo)

textdistance-4.2.2/logo.svg
(SVG source omitted: vector version of the "TextDistance" logo)
textdistance-4.2.2/pyproject.toml

[tool.dephell.main]
from = "setup.py"
tag = "v."

[tool.dephell.pytest-pure]
from = "setup.py"
envs = ["test"]
command = "python3 -m pytest -m 'not external' tests/"

[tool.dephell.pytest-external]
from = "setup.py"
envs = ["test", "benchmark"]
command = "python3 -m pytest tests/"

[tool.dephell.isort]
from = "setup.py"
envs = ["test"]
command = "python3 -m isort -rc ."

[tool.dephell.flake8]
from = {format = "pip", path = "requirements-flake.txt"}
command = "flake8"

textdistance-4.2.2/requirements-flake.txt

flake8
flake8-bugbear
# ^ https://github.com/PyCQA/flake8-bugbear
pep8-naming
# ^ https://github.com/PyCQA/pep8-naming
flake8-commas
# ^ https://github.com/PyCQA/flake8-commas
flake8-quotes
# ^ https://github.com/zheller/flake8-quotes
flake8-blind-except
# ^ https://github.com/elijahandrews/flake8-blind-except
flake8-logging-format
# ^ https://github.com/globality-corp/flake8-logging-format
flake8-pep3101
# ^ https://github.com/gforcada/flake8-pep3101
flake8-string-format
# ^ https://github.com/xZise/flake8-string-format
flake8-mutable
# ^ https://github.com/ebeweber/flake8-mutable
flake8-tidy-imports
# ^ https://github.com/adamchainz/flake8-tidy-imports

textdistance-4.2.2/setup.cfg

[metadata]
description_file = README.md
license_file = LICENSE

[flake8]
max-line-length=120
ignore=P101,P103,E241
exclude=
    .tox,
    .pytest_cache
    venvs/

[isort]
skip=.tox,.pytest_cache,venvs
line_length=120
combine_as_imports=true
balanced_wrapping=true
lines_after_imports=2
not_skip=__init__.py
multi_line_output=5
import_heading_stdlib=built-in
import_heading_thirdparty=external
import_heading_firstparty=project
import_heading_localfolder=app

[tool:pytest]
addopts = --strict-markers
markers =
    external: tests that require external libs to run
textdistance-4.2.2/setup.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# external
from setuptools import setup


extras = {
    # enough for simple usage
    'extras': [
        'abydos',
        'jellyfish',                # for DamerauLevenshtein
        'numpy',                    # for SmithWaterman and other
        'python-Levenshtein',       # for Jaro and Levenshtein
        'pyxDamerauLevenshtein',    # for DamerauLevenshtein
    ],
    # needed for benchmarking, optimization and testing
    'benchmark': [
        # common
        'abydos',
        'jellyfish',
        'numpy',
        'python-Levenshtein',
        'pyxDamerauLevenshtein',
        # slow
        'distance',
        'pylev',
        'py_stringmatching',
        # other
        'tabulate',  # to draw the table with results
    ],
    'test': [
        'hypothesis',
        'isort',
        'numpy',
        'pytest',
    ],

    # for algos, from fastest to slowest, only faster than textdistance:
    'DamerauLevenshtein': [
        'jellyfish',              # only for text
        'pyxDamerauLevenshtein',  # for any iterators
    ],
    'Hamming': [
        'python-Levenshtein',  # only same length and strings
        'jellyfish',           # only strings, any length
        'distance',            # only same length, any iterators
        'abydos',              # any iterators
    ],
    'Jaro': [
        'python-Levenshtein',  # only text
    ],
    'JaroWinkler': [
        'jellyfish',  # only text
    ],
    'Levenshtein': [
        'python-Levenshtein',  # only text
        # yeah, other libs slower than textdistance
    ],
}

# backward compatibility
extras['common'] = extras['extras']
extras['all'] = extras['benchmark']
# correct possible misspelling
extras['extra'] = extras['extras']
extras['benchmarks'] = extras['benchmark']

try:
    long_description = open('README.md', encoding='utf-8').read()
except TypeError:
    try:
        long_description = open('README.md').read()
    except UnicodeDecodeError:
        long_description = ''

setup(
    name='textdistance',
    version='4.2.2',

    author='orsinium',
    author_email='gram@orsinium.dev',

    description='Compute distance between two texts.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    keywords='distance between text strings sequences iterators',

    packages=['textdistance', 'textdistance.algorithms'],
    package_data={'': ['*.json']},
    python_requires='>=3.5',
    extras_require=extras,

    url='https://github.com/orsinium/textdistance',
    download_url='https://github.com/orsinium/textdistance/tarball/master',

    license='MIT',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Plugins',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Topic :: Scientific/Engineering :: Human Machine Interfaces',
    ],
)

textdistance-4.2.2/tests/__init__.py
(empty)

textdistance-4.2.2/tests/test_common.py

# built-in
from math import isclose

# external
import hypothesis
import pytest

# project
import textdistance


ALGS = (
    textdistance.bag,

    textdistance.hamming,
    textdistance.levenshtein,
    textdistance.damerau_levenshtein,
    textdistance.jaro,
    textdistance.jaro_winkler,
    textdistance.mlipns,

    textdistance.lcsseq,
    textdistance.lcsstr,
    textdistance.ratcliff_obershelp,

    textdistance.jaccard,
    textdistance.sorensen,
    textdistance.tversky,
    textdistance.overlap,
    textdistance.cosine,
    textdistance.strcmp95,
    textdistance.monge_elkan,

    textdistance.mra,

    textdistance.prefix,
    textdistance.postfix,
    textdistance.identity,
    # textdistance.length,

    # numpy-based:
    # textdistance.gotoh,
    textdistance.needleman_wunsch,
    textdistance.smith_waterman,
    textdistance.editex,
)


@pytest.mark.parametrize('alg', ALGS)
@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_normalization_range(left, right, alg):
    assert 0 <= alg.normalized_distance(left, right) <= 1
    assert 0 <= alg.normalized_similarity(left, right) <= 1


@pytest.mark.parametrize('alg', ALGS)
@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_normalization_by_one(left, right, alg):
    d = alg.normalized_distance(left, right)
    s = alg.normalized_similarity(left, right)
    assert isclose(s + d, 1)


@pytest.mark.parametrize('alg', ALGS)
@hypothesis.given(text=hypothesis.strategies.text())
def test_normalization_same(text, alg):
    assert alg.normalized_distance(text, text) == 0
    if alg is not textdistance.needleman_wunsch:
        assert alg.distance(text, text) == 0
    assert alg.normalized_similarity(text, text) == 1


@pytest.mark.parametrize('alg', ALGS)
@hypothesis.given(
    left=hypothesis.strategies.text(min_size=1),
    right=hypothesis.strategies.text(min_size=1),
)
def test_normalization_monotonic(left, right, alg):
    nd = alg.normalized_distance(left, right)
    ns = alg.normalized_similarity(left, right)
    d = alg.distance(left, right)
    s = alg.similarity(left, right)
    assert (nd < ns) == (d < s)


@pytest.mark.parametrize('alg', ALGS)
def test_no_common_chars(alg):
    if alg is textdistance.editex:
        return
    assert alg.similarity('spam', 'qwer') == 0


@pytest.mark.parametrize('alg', ALGS)
def test_empty(alg):
    assert alg.distance('', '') == 0


@pytest.mark.parametrize('alg', ALGS)
def test_unequal_distance(alg):
    if alg.maximum('', 'qwertyui'):
        assert alg.distance('', 'qwertyui') > 0
textdistance-4.2.2/tests/test_compression/__init__.py
(empty)

textdistance-4.2.2/tests/test_compression/test_arith_ncd.py

# built-in
from fractions import Fraction
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.arith_ncd


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'test', 1),
    ('test', 'nani', 2.1666666666666665),
])
def test_similarity(left, right, expected):
    actual = ALG(left, right)
    assert isclose(actual, expected)


def test_make_probs():
    alg = textdistance.ArithNCD(terminator='\x00')
    probs = alg._make_probs('lol', 'lal')
    assert probs['l'] == (Fraction(0, 1), Fraction(4, 7))
    assert probs['o'][1] == Fraction(1, 7)
    assert probs['a'][1] == Fraction(1, 7)


def test_arith_output():
    alg = textdistance.ArithNCD(terminator='\x00')
    fraction = alg._compress('BANANA')
    assert fraction.numerator == 1525

textdistance-4.2.2/tests/test_compression/test_bwtrle_ncd.py

# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.bwtrle_ncd


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'test', 0.6),
    ('test', 'nani', 0.8),
])
def test_similarity(left, right, expected):
    actual = ALG(left, right)
    assert isclose(actual, expected)

textdistance-4.2.2/tests/test_compression/test_bz2_ncd.py

# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.bz2_ncd


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'test', 0.08),
    ('test', 'nani', 0.16),
])
def test_similarity(left, right, expected):
    actual = ALG(left, right)
    assert isclose(actual, expected)

textdistance-4.2.2/tests/test_compression/test_common.py

# built-in
from math import isclose

# external
import hypothesis
import pytest

# project
import textdistance


ALGS = (
    textdistance.arith_ncd,
    textdistance.bwtrle_ncd,
    textdistance.bz2_ncd,

    # too slow, makes CI flaky
    # textdistance.lzma_ncd,

    textdistance.rle_ncd,
    textdistance.zlib_ncd,
    textdistance.sqrt_ncd,
    textdistance.entropy_ncd,
)


@pytest.mark.parametrize('alg', ALGS)
def test_monotonicity(alg):
    same = alg('test', 'test')
    similar = alg('test', 'text')
    different = alg('test', 'nani')
    assert same <= similar <= different


@pytest.mark.parametrize('alg', ALGS)
@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_symmetry(left, right, alg):
    assert alg.similarity(left, right) == alg.similarity(right, left)
    assert alg.distance(left, right) == alg.distance(right, left)
    assert alg.normalized_similarity(left, right) == alg.normalized_similarity(right, left)
    assert alg.normalized_distance(left, right) == alg.normalized_distance(right, left)


@pytest.mark.parametrize('alg', ALGS)
@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_is_normalized(left, right, alg):
    a = alg(left, right)
    d = alg.distance(left, right)
    nd = alg.normalized_distance(left, right)
    assert a == d == nd
@pytest.mark.parametrize('alg', ALGS)
@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_normalized_by_one(left, right, alg):
    s = alg.normalized_similarity(left, right)
    d = alg.normalized_distance(left, right)
    assert isclose(s + d, 1)

textdistance-4.2.2/tests/test_compression/test_entropy_ncd.py

# built-in
from math import isclose

# external
import hypothesis
import pytest

# project
import textdistance


ALG = textdistance.entropy_ncd


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'test', 1),
    ('aaa', 'bbb', 0),
    ('test', 'nani', 0.6),
])
def test_similarity(left, right, expected):
    actual = ALG.similarity(left, right)
    assert isclose(actual, expected)


@hypothesis.given(text=hypothesis.strategies.text(min_size=1))
def test_symmetry_compressor(text):
    rev = ''.join(reversed(text))
    assert isclose(ALG._compress(text), ALG._compress(rev))


@hypothesis.given(text=hypothesis.strategies.text(min_size=1))
def test_idempotency_compressor(text):
    # Idempotency is relaxed here into a kind of distributivity over
    # concatenation: doubling the input must cost less than double the size,
    # which indicates that the compressor really compresses.
    assert ALG._get_size(text * 2) < ALG._get_size(text) * 2


@hypothesis.given(
    left=hypothesis.strategies.text(min_size=1),
    right=hypothesis.strategies.characters(),
)
def test_monotonicity_compressor(left, right):
    if right in left:
        return
    assert ALG._get_size(left) <= ALG._get_size(left + right)


@hypothesis.given(
    left1=hypothesis.strategies.text(min_size=1),
    left2=hypothesis.strategies.text(min_size=1),
    right=hypothesis.strategies.characters(),
)
def test_distributivity_compressor(left1, left2, right):
    if right in left1 or right in left2:
        return
    actual1 = ALG._get_size(left1 + left2) + ALG._get_size(right)
    actual2 = ALG._get_size(left1 + right) + ALG._get_size(left2 + right)
    assert actual1 <= actual2


@hypothesis.given(text=hypothesis.strategies.text(min_size=1))
def test_normalization_range(text):
    assert 0 <= ALG.normalized_similarity(text, text) <= 1
    assert 0 <= ALG.normalized_distance(text, text) <= 1
textdistance-4.2.2/tests/test_compression/test_sqrt_ncd.py

# built-in
from math import isclose

# external
import hypothesis
import pytest

# project
import textdistance


ALG = textdistance.sqrt_ncd


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'test', 0.41421356237309503),
    ('test', 'nani', 1),
])
def test_similarity(left, right, expected):
    actual = ALG(left, right)
    assert isclose(actual, expected)


@hypothesis.given(text=hypothesis.strategies.text(min_size=1))
def test_symmetry_compressor(text):
    rev = ''.join(reversed(text))
    assert ALG._compress(text) == ALG._compress(rev)


@hypothesis.given(text=hypothesis.strategies.text(min_size=1))
def test_idempotency_compressor(text):
    # Idempotency is relaxed here into a kind of distributivity over
    # concatenation: doubling the input must cost less than double the size,
    # which indicates that the compressor really compresses.
    assert ALG._get_size(text * 2) < ALG._get_size(text) * 2


@hypothesis.given(
    left=hypothesis.strategies.text(min_size=1),
    right=hypothesis.strategies.characters(),
)
def test_monotonicity_compressor(left, right):
    if right in left:
        return
    assert ALG._get_size(left) <= ALG._get_size(left + right)


@hypothesis.given(
    left1=hypothesis.strategies.text(min_size=1),
    left2=hypothesis.strategies.text(min_size=1),
    right=hypothesis.strategies.text(min_size=1),
)
def test_distributivity_compressor(left1, left2, right):
    actual1 = ALG._get_size(left1 + left2) + ALG._get_size(right)
    actual2 = ALG._get_size(left1 + right) + ALG._get_size(left2 + right)
    assert actual1 <= actual2


@hypothesis.given(text=hypothesis.strategies.text(min_size=1))
def test_normalization_range(text):
    assert 0 <= ALG.normalized_similarity(text, text) <= 1
    assert 0 <= ALG.normalized_distance(text, text) <= 1

textdistance-4.2.2/tests/test_edit/__init__.py
(empty)

textdistance-4.2.2/tests/test_edit/test_damerau_levenshtein.py

# external
import pytest

# project
import textdistance


ALG = textdistance.DamerauLevenshtein


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 1),
    ('test', 'tset', 1),
    ('test', 'qwy', 4),
    ('test', 'testit', 2),
    ('test', 'tesst', 1),
    ('test', 'tet', 1),

    ('cat', 'hat', 1),
    ('Niall', 'Neil', 3),
    ('aluminum', 'Catalan', 7),
    ('ATCG', 'TAGC', 2),

    ('ab', 'ba', 1),
    ('ab', 'cde', 3),
    ('ab', 'ac', 1),
    ('ab', 'bc', 2),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected

    actual = ALG()._pure_python(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_editex.py

# external
import pytest

# project
import textdistance


ALG = textdistance.Editex


@pytest.mark.parametrize('left, right, expected', [
    ('', '', 0),
    ('nelson', '', 12),
    ('', 'neilsen', 14),
    ('ab', 'a', 2),
    ('ab', 'c', 4),
    ('ALIE', 'ALI', 1),
    ('', 'MARTHA', 12),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected


@pytest.mark.parametrize('left, right, params, expected', [
    ('MARTHA', 'MARHTA', dict(match_cost=2), 12),
    ('MARTHA', 'MARHTA', dict(match_cost=4), 14),
    ('MARTHA', 'MARHTA', dict(group_cost=1, local=True), 3),
    ('MARTHA', 'MARHTA', dict(group_cost=2, local=True), 4),
    ('MARTHA', 'MARHTA', dict(mismatch_cost=4, local=True), 5),
])
def test_distance_with_params(left, right, params, expected):
    actual = ALG(external=False, **params)(left, right)
    assert actual == expected

    actual = ALG(external=True, **params)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_gotoh.py

# external
import pytest

# project
import textdistance


ALG = textdistance.Gotoh

# https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
NW_MATRIX = {
    ('A', 'A'): 10,
    ('G', 'G'): 7,
    ('C', 'C'): 9,
    ('T', 'T'): 8,
    ('A', 'G'): -1,
    ('A', 'C'): -3,
    ('A', 'T'): -4,
    ('G', 'C'): -5,
    ('G', 'T'): -3,
    ('C', 'T'): 0,
}
def sim_ident(x, y):
    if x == y:
        return 1
    else:
        return -1


@pytest.mark.parametrize('left, right, expected', [
    ('GATTACA', 'GCATGCU', 0),
])
def test_distance_ident(left, right, expected):
    actual = ALG(gap_open=1, gap_ext=1, sim_func=sim_ident)(left, right)
    assert actual == expected


@pytest.mark.parametrize('left, right, expected', [
    ('GATTACA', 'GCATGCU', 0),
    ('AGACTAGTTAC', 'TGACGSTGC', 1.5),
    ('AGACTAGTTAC', 'CGAGACGT', 1),
])
def test_distance_ident_with_gap_05(left, right, expected):
    actual = ALG(gap_open=1, gap_ext=.5, sim_func=sim_ident)(left, right)
    assert actual == expected


@pytest.mark.parametrize('left, right, expected', [
    ('AGACTAGTTAC', 'CGAGACGT', -15),
])
def test_distance_ident_with_gap_5(left, right, expected):
    actual = ALG(gap_open=5, gap_ext=5, sim_func=sim_ident)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_hamming.py

# external
import pytest

# project
import textdistance


ALG = textdistance.Hamming


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 1),
    ('test', 'tset', 2),
    ('test', 'qwe', 4),
    ('test', 'testit', 2),
    ('test', 'tesst', 2),
    ('test', 'tet', 2),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_jaro.py

# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.JaroWinkler


@pytest.mark.parametrize('left, right, expected', [
    ('hello', 'haloa', 0.7333333333333334),
    ('fly', 'ant', 0.0),
    ('frog', 'fog', 0.9166666666666666),
    ('ATCG', 'TAGC', 0.8333333333333334),
    ('MARTHA', 'MARHTA', 0.944444444),
    ('DWAYNE', 'DUANE', 0.822222222),
    ('DIXON', 'DICKSONX', 0.7666666666666666),
    # https://github.com/life4/textdistance/issues/41
    ('Sint-Pietersplein 6, 9000 Gent', 'Test 10, 1010 Brussel', 0.5182539682539683),
])
def test_distance(left, right, expected):
    actual = ALG(winklerize=False, external=False)(left, right)
    assert isclose(actual, expected)

    actual = ALG(winklerize=False, external=True)(left, right)
    assert isclose(actual, expected)

textdistance-4.2.2/tests/test_edit/test_jaro_winkler.py

# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.JaroWinkler


@pytest.mark.parametrize('left, right, expected', [
    ('elephant', 'hippo', 0.44166666666666665),
    ('fly', 'ant', 0.0),
    ('frog', 'fog', 0.925),
    ('MARTHA', 'MARHTA', 0.9611111111111111),
    ('DWAYNE', 'DUANE', 0.84),
    ('DIXON', 'DICKSONX', 0.8133333333333332),
    # https://github.com/life4/textdistance/issues/39
    ('duck donald', 'duck daisy', 0.867272727272),
])
def test_distance(left, right, expected):
    actual = ALG(winklerize=True, external=False)(left, right)
    assert isclose(actual, expected)

    actual = ALG(winklerize=True, external=True)(left, right)
    assert isclose(actual, expected)
textdistance-4.2.2/tests/test_edit/test_levenshtein.py

# external
import pytest

# project
import textdistance


ALG = textdistance.Levenshtein


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 1),
    ('test', 'tset', 2),
    ('test', 'qwe', 4),
    ('test', 'testit', 2),
    ('test', 'tesst', 1),
    ('test', 'tet', 1),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_matrix.py

# external
import pytest

# project
import textdistance


ALG = textdistance.Matrix

# https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
NW_MATRIX = {
    ('A', 'A'): 10,
    ('G', 'G'): 7,
    ('C', 'C'): 9,
    ('T', 'T'): 8,
    ('A', 'G'): -1,
    ('A', 'C'): -3,
    ('A', 'T'): -4,
    ('G', 'C'): -5,
    ('G', 'T'): -3,
    ('C', 'T'): 0,
}


@pytest.mark.parametrize('left, right, expected', [
    ('', '', 1),
    ('', 'a', 0),
    ('abcd', 'abcd', 1),
    ('A', 'C', -3),
    ('G', 'G', 7),
    ('A', 'A', 10),
    ('T', 'A', -4),
    ('T', 'C', 0),
    ('A', 'G', -1),
    ('C', 'T', 0),
])
def test_distance(left, right, expected):
    actual = ALG(NW_MATRIX, symmetric=True)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_mlipns.py

# external
import pytest

# project
import textdistance


ALG = textdistance.MLIPNS


@pytest.mark.parametrize('left, right, expected', [
    ('', '', 1),
    ('a', '', 0),
    ('', 'a', 0),
    ('a', 'a', 1),
    ('ab', 'a', 1),
    ('abc', 'abc', 1),
    ('abc', 'abcde', 1),
    ('abcg', 'abcdeg', 1),
    ('abcg', 'abcdefg', 0),
    ('Tomato', 'Tamato', 1),
    ('ato', 'Tam', 1),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_needleman_wunsch.py

# external
import pytest

# project
import textdistance


ALG = textdistance.NeedlemanWunsch

# https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
NW_MATRIX = {
    ('A', 'A'): 10,
    ('G', 'G'): 7,
    ('C', 'C'): 9,
    ('T', 'T'): 8,
    ('A', 'G'): -1,
    ('A', 'C'): -3,
    ('A', 'T'): -4,
    ('G', 'C'): -5,
    ('G', 'T'): -3,
    ('C', 'T'): 0,
}


@pytest.mark.parametrize('left, right, expected', [
    ('AGACTAGTTAC', 'CGAGACGT', 16),
])
def test_distance_matrix(left, right, expected):
    sim_matrix = textdistance.Matrix(NW_MATRIX, symmetric=True)
    actual = ALG(gap_cost=5, sim_func=sim_matrix)(left, right)
    assert actual == expected


def sim_ident(x, y):
    if x == y:
        return 1
    else:
        return -1


@pytest.mark.parametrize('left, right, expected', [
    ('GATTACA', 'GCATGCU', 0),
])
def test_distance_ident(left, right, expected):
    actual = ALG(sim_func=sim_ident)(left, right)
    assert actual == expected


@pytest.mark.parametrize('left, right, expected', [
    ('CGATATCAG', 'TGACGSTGC', -5),
    ('AGACTAGTTAC', 'TGACGSTGC', -7),
    ('AGACTAGTTAC', 'CGAGACGT', -15),
])
def test_distance_ident_with_gap_5(left, right, expected):
    actual = ALG(gap_cost=5, sim_func=sim_ident)(left, right)
    assert actual == expected
textdistance-4.2.2/tests/test_edit/test_smith_waterman.py

# external
import pytest

# project
import textdistance


ALG = textdistance.SmithWaterman

# https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
NW_MATRIX = {
    ('A', 'A'): 10,
    ('G', 'G'): 7,
    ('C', 'C'): 9,
    ('T', 'T'): 8,
    ('A', 'G'): -1,
    ('A', 'C'): -3,
    ('A', 'T'): -4,
    ('G', 'C'): -5,
    ('G', 'T'): -3,
    ('C', 'T'): 0,
}


@pytest.mark.parametrize('left, right, expected', [
    ('AGACTAGTTAC', 'CGAGACGT', 26),
])
def test_distance_matrix(left, right, expected):
    sim_matrix = textdistance.Matrix(NW_MATRIX, symmetric=True)
    actual = ALG(gap_cost=5, sim_func=sim_matrix)(left, right)
    assert actual == expected


def sim_ident(x, y):
    if x == y:
        return 1
    else:
        return -1


@pytest.mark.parametrize('left, right, expected', [
    ('GATTACA', 'GCATGCU', 0),
])
def test_distance_ident(left, right, expected):
    actual = ALG(sim_func=sim_ident)(left, right)
    assert actual == expected


@pytest.mark.parametrize('left, right, expected', [
    ('CGATATCAG', 'TGACGSTGC', 0),
    ('AGACTAGTTAC', 'TGACGSTGC', 1),
    ('AGACTAGTTAC', 'CGAGACGT', 0),
])
def test_distance_ident_with_gap_5(left, right, expected):
    actual = ALG(gap_cost=5, sim_func=sim_ident)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_edit/test_strcmp95.py

# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.StrCmp95


@pytest.mark.parametrize('left, right, expected', [
    ('MARTHA', 'MARHTA', 0.9611111111111111),
    ('DWAYNE', 'DUANE', 0.873),
    ('DIXON', 'DICKSONX', 0.839333333),
    ('TEST', 'TEXT', 0.9066666666666666),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert isclose(actual, expected)

    actual = ALG(external=True)(left, right)
    assert isclose(actual, expected)
textdistance-4.2.2/tests/test_external.py

# built-in
from math import isclose

# external
import hypothesis
import pytest

# project
import textdistance
from textdistance.libraries import prototype


libraries = prototype.clone()


@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.given(
    left=hypothesis.strategies.text(min_size=1),
    right=hypothesis.strategies.text(min_size=1),
)
def test_compare(left, right, alg):
    for lib in libraries.get_libs(alg):
        conditions = lib.conditions or {}
        internal_func = getattr(textdistance, alg)(external=False, **conditions)
        external_func = lib.get_function()
        if external_func is None:
            raise RuntimeError('cannot import {}'.format(str(lib)))

        if not lib.check_conditions(internal_func, left, right):
            continue

        int_result = internal_func(left, right)
        s1, s2 = lib.prepare(left, right)
        ext_result = external_func(s1, s2)
        assert isclose(int_result, ext_result), str(lib)


@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.given(
    left=hypothesis.strategies.text(min_size=1),
    right=hypothesis.strategies.text(min_size=1),
)
def test_qval(left, right, alg):
    for lib in libraries.get_libs(alg):
        conditions = lib.conditions or {}
        internal_func = getattr(textdistance, alg)(external=False, **conditions)
        external_func = lib.get_function()

        # the algorithm doesn't support q-grams
        if not hasattr(internal_func, 'qval'):
            continue

        for qval in (None, 1, 2, 3):
            internal_func.qval = qval
            # skip if this qval is not supported by the lib
            s1, s2 = internal_func._get_sequences(left, right)
            if not lib.check_conditions(internal_func, s1, s2):
                continue
            # test
            int_result = internal_func(left, right)
            s1, s2 = lib.prepare(s1, s2)
            ext_result = external_func(s1, s2)
            assert isclose(int_result, ext_result), str(lib)


@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.given(
    left=hypothesis.strategies.lists(hypothesis.strategies.integers()),
    right=hypothesis.strategies.lists(hypothesis.strategies.integers()),
)
def test_list_of_numbers(left, right, alg):
    for lib in libraries.get_libs(alg):
        conditions = lib.conditions or {}
        internal_func = getattr(textdistance, alg)(external=False, **conditions)
        external_func = lib.get_function()
        if external_func is None:
            raise RuntimeError('cannot import {}'.format(str(lib)))

        if not lib.check_conditions(internal_func, left, right):
            continue

        int_result = internal_func(left, right)
        s1, s2 = lib.prepare(left, right)
        ext_result = external_func(s1, s2)
        assert isclose(int_result, ext_result), str(lib)

textdistance-4.2.2/tests/test_phonetic/__init__.py
(empty)

textdistance-4.2.2/tests/test_phonetic/test_editex.py

# external
import pytest

# project
import textdistance


ALG = textdistance.Editex


@pytest.mark.parametrize('left, right, expected', [
    # https://github.com/chrislit/abydos/blob/master/tests/distance/test_distance_editex.py
    ('', '', 0),
    ('nelson', '', len('nelson') * 2),
    ('', 'neilsen', len('neilsen') * 2),
    ('ab', 'a', 2),
    ('ab', 'c', 4),
    ('nelson', 'neilsen', 2),
    ('neilsen', 'nelson', 2),
    ('niall', 'neal', 1),
    ('neal', 'niall', 1),
    ('niall', 'nihal', 2),
    ('nihal', 'niall', 2),
    ('neal', 'nihl', 3),
    ('nihl', 'neal', 3),

    # https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html
    ('cat', 'hat', 2),
    ('Niall', 'Neil', 2),
    ('aluminum', 'Catalan', 12),
    ('ATCG', 'TAGC', 6),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected


@pytest.mark.parametrize('left, right, expected', [
    ('', '', 0),
    ('nelson', '', 12),
    ('', 'neilsen', 14),
    ('ab', 'a', 2),
    ('ab', 'c', 2),
    ('nelson', 'neilsen', 2),
    ('neilsen', 'nelson', 2),
    ('niall', 'neal', 1),
    ('neal', 'niall', 1),
    ('niall', 'nihal', 2),
    ('nihal', 'niall', 2),
    ('neal', 'nihl', 3),
    ('nihl', 'neal', 3),
])
def test_local(left, right, expected):
    actual = ALG(external=False, local=True)(left, right)
    assert actual == expected

    actual = ALG(external=True, local=True)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_sequence/__init__.py
(empty)

textdistance-4.2.2/tests/test_sequence/test_lcsseq.py

# external
import pytest

# project
import textdistance


ALG = textdistance.LCSSeq


@pytest.mark.parametrize('left, right, expected', [
    ('ab', 'cd', ''),
    ('abcd', 'abcd', 'abcd'),
    ('test', 'text', 'tet'),
    ('thisisatest', 'testing123testing', 'tsitest'),
    ('DIXON', 'DICKSONX', 'DION'),
    ('random exponential', 'layer activation', 'ratia'),
    ('a' * 80, 'a' * 80, 'a' * 80),
    ('a' * 80, 'b' * 80, ''),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected


@pytest.mark.parametrize('seqs, expected', [
    (('a', 'b', 'c'), ''),
    (('a', 'a', 'a'), 'a'),
    (('test', 'text', 'tempest'), 'tet'),
])
def test_distance_multiseq(seqs, expected):
    actual = ALG(external=False)(*seqs)
    assert actual == expected

    actual = ALG(external=True)(*seqs)
    assert actual == expected
textdistance-4.2.2/tests/test_sequence/test_lcsstr.py

# external
import pytest

# project
import textdistance


ALG = textdistance.LCSStr


@pytest.mark.parametrize('left, right, expected', [
    # prefix
    ('ab', 'abcd', 'ab'),
    ('abcd', 'ab', 'ab'),

    # middle
    ('abcd', 'bc', 'bc'),
    ('bc', 'abcd', 'bc'),

    # suffix
    ('abcd', 'cd', 'cd'),

    # no match
    ('abcd', 'ef', ''),
    ('ef', 'abcd', ''),

    # long
    # https://github.com/life4/textdistance/issues/40
    ('MYTEST' * 100, 'TEST', 'TEST'),
    ('TEST', 'MYTEST' * 100, 'TEST'),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_token/__init__.py
(empty)

textdistance-4.2.2/tests/test_token/test_bag.py

# external
import pytest

# project
import textdistance


ALG = textdistance.Bag


@pytest.mark.parametrize('left, right, expected', [
    ('qwe', 'qwe', 0),
    ('qwe', 'erty', 3),
    ('qwe', 'ewq', 0),
    ('qwe', 'rtys', 4),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert actual == expected

    actual = ALG(external=True)(left, right)
    assert actual == expected

textdistance-4.2.2/tests/test_token/test_cosine.py

# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.Cosine


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 3.0 / 4),
    ('nelson', 'neilsen', 5.0 / pow(6 * 7, .5)),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert isclose(actual, expected)

    actual = ALG(external=True)(left, right)
    assert isclose(actual, expected)

textdistance-4.2.2/tests/test_token/test_jaccard.py

# built-in
from math import isclose

# external
import hypothesis
import pytest

# project
import textdistance


ALG = textdistance.Jaccard


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 3.0 / 5),
    ('nelson', 'neilsen', 5.0 / 8),
    ('decide', 'resize', 3.0 / 9),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert isclose(actual, expected)

    actual = ALG(external=True)(left, right)
    assert isclose(actual, expected)


@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_compare_with_tversky(left, right):
    td = textdistance.Tversky(ks=[1, 1]).distance(left, right)
    jd = ALG().distance(left, right)
    assert isclose(jd, td)


@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_compare_with_tversky_as_set(left, right):
    td = textdistance.Tversky(ks=[1, 1], as_set=True).distance(left, right)
    jd = ALG(as_set=True).distance(left, right)
    assert isclose(jd, td)
textdistance-4.2.2/tests/test_token/test_monge_elkan.py
# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.MongeElkan


@pytest.mark.parametrize('left, right, expected', [
    (['Niall'], ['Neal'], .805),
    (['Niall'], ['Nigel'], 0.7866666666666667),
])
def test_similarity(left, right, expected):
    actual = ALG(qval=1, algorithm=textdistance.jaro_winkler).similarity(left, right)
    assert isclose(actual, expected)

textdistance-4.2.2/tests/test_token/test_overlap.py
# built-in
from math import isclose

# external
import pytest

# project
import textdistance


ALG = textdistance.Overlap


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 3.0 / 4),
    ('testme', 'textthis', 4.0 / 6),
    ('nelson', 'neilsen', 5.0 / 6),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert isclose(actual, expected)

    actual = ALG(external=True)(left, right)
    assert isclose(actual, expected)

textdistance-4.2.2/tests/test_token/test_sorensen.py
# built-in
from math import isclose

# external
import hypothesis
import pytest

# project
import textdistance


ALG = textdistance.Sorensen


@pytest.mark.parametrize('left, right, expected', [
    ('test', 'text', 2.0 * 3 / 8),
])
def test_distance(left, right, expected):
    actual = ALG(external=False)(left, right)
    assert isclose(actual, expected)

    actual = ALG(external=True)(left, right)
    assert isclose(actual, expected)


@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_compare_with_tversky(left, right):
    td = textdistance.Tversky(ks=[.5, .5]).distance(left, right)
    jd = ALG().distance(left, right)
    assert isclose(jd, td)


@hypothesis.given(
    left=hypothesis.strategies.text(),
    right=hypothesis.strategies.text(),
)
def test_compare_with_tversky_as_set(left, right):
    td = textdistance.Tversky(ks=[.5, .5], as_set=True).distance(left, right)
    jd = ALG(as_set=True).distance(left, right)
    assert isclose(jd, td)
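# --- Editor's illustrative sketch (not part of the original archive) ---
# The hypothesis tests above rely on Tversky reducing to Jaccard with
# ks=[1, 1] and to Sorensen with ks=[.5, .5]; a quick standalone check
# of the same identities:
from math import isclose

import textdistance

left, right = 'nelson', 'neilsen'
assert isclose(textdistance.Tversky(ks=[1, 1])(left, right), textdistance.Jaccard()(left, right))
assert isclose(textdistance.Tversky(ks=[.5, .5])(left, right), textdistance.Sorensen()(left, right))
# --- end sketch ---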
""" # main package info __title__ = 'TextDistance' __version__ = '4.2.2' __author__ = 'Gram (@orsinium)' __license__ = 'MIT' # version synonym VERSION = __version__ # app from .algorithms import * # noQA from .utils import * # noQA textdistance-4.2.2/textdistance/algorithms/000077500000000000000000000000001413625033700210725ustar00rootroot00000000000000textdistance-4.2.2/textdistance/algorithms/__init__.py000066400000000000000000000003311413625033700232000ustar00rootroot00000000000000 # app from .compression_based import * # noQA from .edit_based import * # noQA from .phonetic import * # noQA from .sequence_based import * # noQA from .simple import * # noQA from .token_based import * # noQA textdistance-4.2.2/textdistance/algorithms/base.py000066400000000000000000000131021413625033700223530ustar00rootroot00000000000000# built-in from collections import Counter from contextlib import suppress # app from ..libraries import prototype from ..utils import find_ngrams libraries = prototype.clone() libraries.optimize() class Base: def __init__(self, qval=1, external=True): self.qval = qval self.external = external def __call__(self, *sequences): raise NotImplementedError @staticmethod def maximum(*sequences): """Get maximum possible value """ return max(map(len, sequences)) def distance(self, *sequences): """Get distance between sequences """ return self(*sequences) def similarity(self, *sequences): """Get sequences similarity. similarity = maximum - distance """ return self.maximum(*sequences) - self.distance(*sequences) def normalized_distance(self, *sequences): """Get distance from 0 to 1 """ maximum = self.maximum(*sequences) if maximum == 0: return 0 return self.distance(*sequences) / maximum def normalized_similarity(self, *sequences): """Get similarity from 0 to 1 normalized_similarity = 1 - normalized_distance """ return 1 - self.normalized_distance(*sequences) def external_answer(self, *sequences): """Try to get answer from known external libraries. """ # if this feature disabled if not getattr(self, 'external', False): return # all external libs doesn't support test_func if hasattr(self, 'test_func') and self.test_func is not self._ident: return # try to get external libs for algorithm libs = libraries.get_libs(self.__class__.__name__) for lib in libs: # if conditions not satisfied if not lib.check_conditions(self, *sequences): continue # if library is not installed yet if not lib.get_function(): continue prepared_sequences = lib.prepare(*sequences) # fail side libraries silently and try next libs with suppress(Exception): return lib.func(*prepared_sequences) def quick_answer(self, *sequences): """Try to get answer quick without main implementation calling. If no sequences, 1 sequence or all sequences are equal then return 0. If any sequence are empty then return maximum. And in finish try to get external answer. """ if not sequences: return 0 if len(sequences) == 1: return 0 if self._ident(*sequences): return 0 if not all(sequences): return self.maximum(*sequences) # try get answer from external libs answer = self.external_answer(*sequences) if answer is not None: return answer @staticmethod def _ident(*elements): """Return True if all sequences are equal. """ try: # for hashable elements return len(set(elements)) == 1 except TypeError: # for unhashable elements for e1, e2 in zip(elements, elements[1:]): if e1 != e2: return False return True def _get_sequences(self, *sequences): """Prepare sequences. qval=None: split text by words qval=1: do not split sequences. 
textdistance-4.2.2/textdistance/algorithms/
textdistance-4.2.2/textdistance/algorithms/__init__.py

# app
from .compression_based import *  # noQA
from .edit_based import *  # noQA
from .phonetic import *  # noQA
from .sequence_based import *  # noQA
from .simple import *  # noQA
from .token_based import *  # noQA

textdistance-4.2.2/textdistance/algorithms/base.py
# built-in
from collections import Counter
from contextlib import suppress

# app
from ..libraries import prototype
from ..utils import find_ngrams


libraries = prototype.clone()
libraries.optimize()


class Base:
    def __init__(self, qval=1, external=True):
        self.qval = qval
        self.external = external

    def __call__(self, *sequences):
        raise NotImplementedError

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value
        """
        return max(map(len, sequences))

    def distance(self, *sequences):
        """Get distance between sequences
        """
        return self(*sequences)

    def similarity(self, *sequences):
        """Get sequences similarity.

        similarity = maximum - distance
        """
        return self.maximum(*sequences) - self.distance(*sequences)

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1
        """
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 0
        return self.distance(*sequences) / maximum

    def normalized_similarity(self, *sequences):
        """Get similarity from 0 to 1

        normalized_similarity = 1 - normalized_distance
        """
        return 1 - self.normalized_distance(*sequences)

    def external_answer(self, *sequences):
        """Try to get the answer from known external libraries.
        """
        # if this feature is disabled
        if not getattr(self, 'external', False):
            return
        # external libs don't support test_func
        if hasattr(self, 'test_func') and self.test_func is not self._ident:
            return

        # try to get external libs for the algorithm
        libs = libraries.get_libs(self.__class__.__name__)
        for lib in libs:
            # if conditions are not satisfied
            if not lib.check_conditions(self, *sequences):
                continue
            # if the library is not installed yet
            if not lib.get_function():
                continue

            prepared_sequences = lib.prepare(*sequences)
            # fail external libraries silently and try the next one
            with suppress(Exception):
                return lib.func(*prepared_sequences)

    def quick_answer(self, *sequences):
        """Try to get the answer quickly, without calling the main implementation.

        If there are no sequences, only one sequence, or all sequences are
        equal, return 0. If any sequence is empty, return the maximum.
        Finally, try to get the answer from external libraries.
        """
        if not sequences:
            return 0
        if len(sequences) == 1:
            return 0
        if self._ident(*sequences):
            return 0
        if not all(sequences):
            return self.maximum(*sequences)
        # try to get the answer from external libs
        answer = self.external_answer(*sequences)
        if answer is not None:
            return answer

    @staticmethod
    def _ident(*elements):
        """Return True if all sequences are equal.
        """
        try:
            # for hashable elements
            return len(set(elements)) == 1
        except TypeError:
            # for unhashable elements
            for e1, e2 in zip(elements, elements[1:]):
                if e1 != e2:
                    return False
            return True

    def _get_sequences(self, *sequences):
        """Prepare sequences.

        qval=None: split text by words
        qval=1: do not split sequences. For text this means comparing by letters.
        qval>1: split sequences by q-grams
        """
        # by words
        if not self.qval:
            return [s.split() for s in sequences]
        # by chars
        if self.qval == 1:
            return sequences
        # by n-grams
        return [find_ngrams(s, self.qval) for s in sequences]

    def _get_counters(self, *sequences):
        """Prepare sequences and convert them to Counters.
        """
        # already Counters
        if all(isinstance(s, Counter) for s in sequences):
            return sequences
        return [Counter(s) for s in self._get_sequences(*sequences)]

    def _intersect_counters(self, *sequences):
        intersection = sequences[0].copy()
        for s in sequences[1:]:
            intersection &= s
        return intersection

    def _union_counters(self, *sequences):
        union = sequences[0].copy()
        for s in sequences[1:]:
            union |= s
        return union

    def _sum_counters(self, *sequences):
        result = sequences[0].copy()
        for s in sequences[1:]:
            result += s
        return result

    def _count_counters(self, counter):
        """Return the total count of all elements in a Counter
        """
        if getattr(self, 'as_set', False):
            return len(set(counter))
        else:
            return sum(counter.values())

    def __repr__(self):
        return '{name}({data})'.format(
            name=type(self).__name__,
            data=self.__dict__,
        )


class BaseSimilarity(Base):
    def distance(self, *sequences):
        return self.maximum(*sequences) - self.similarity(*sequences)

    def similarity(self, *sequences):
        return self(*sequences)

    def quick_answer(self, *sequences):
        if not sequences:
            return self.maximum(*sequences)
        if len(sequences) == 1:
            return self.maximum(*sequences)
        if self._ident(*sequences):
            return self.maximum(*sequences)
        if not all(sequences):
            return 0
        # try to get the answer from external libs
        answer = self.external_answer(*sequences)
        if answer is not None:
            return answer
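# --- Editor's illustrative sketch (not part of the original archive) ---
# A toy subclass showing how the Base interface composes: defining only
# __call__ and maximum() is enough to inherit similarity(),
# normalized_distance() and normalized_similarity(). The class name and
# metric here are hypothetical; the import path is the package-internal
# module above.
from textdistance.algorithms.base import Base


class FirstCharDistance(Base):
    def __call__(self, s1, s2):
        # distance is 1 when the first characters differ, else 0
        return int(s1[:1] != s2[:1])

    def maximum(self, *sequences):
        return 1


assert FirstCharDistance().normalized_similarity('spam', 'spark') == 1.0
# --- end sketch ---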
textdistance-4.2.2/textdistance/algorithms/compression_based.py
# built-in
import codecs
import math
from collections import Counter
from fractions import Fraction
from itertools import groupby, permutations

# app
from .base import Base as _Base

try:
    import lzma
except ImportError:
    lzma = None


__all__ = [
    'ArithNCD', 'LZMANCD', 'BZ2NCD', 'RLENCD', 'BWTRLENCD', 'ZLIBNCD',
    'SqrtNCD', 'EntropyNCD',

    'bz2_ncd', 'lzma_ncd', 'arith_ncd', 'rle_ncd', 'bwtrle_ncd',
    'zlib_ncd', 'sqrt_ncd', 'entropy_ncd',
]


try:
    string_types = (str, unicode)
except NameError:
    string_types = (str, )


class _NCDBase(_Base):
    """Normalized compression distance (NCD)

    https://articles.orsinium.dev/other/ncd/
    https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance
    """
    qval = 1

    def __init__(self, qval=1):
        self.qval = qval

    def maximum(self, *sequences):
        return 1

    def _get_size(self, data):
        return len(self._compress(data))

    def __call__(self, *sequences):
        if not sequences:
            return 0
        sequences = self._get_sequences(*sequences)

        concat_len = float('Inf')
        empty = type(sequences[0])()
        for data in permutations(sequences):
            if isinstance(empty, (str, bytes)):
                data = empty.join(data)
            else:
                data = sum(data, empty)
            concat_len = min(concat_len, self._get_size(data))

        compressed_lens = [self._get_size(s) for s in sequences]
        max_len = max(compressed_lens)
        if max_len == 0:
            return 0
        return (concat_len - min(compressed_lens) * (len(sequences) - 1)) / max_len


class _BinaryNCDBase(_NCDBase):

    def __init__(self):
        pass

    def __call__(self, *sequences):
        if not sequences:
            return 0
        if isinstance(sequences[0], string_types):
            sequences = [s.encode('utf-8') for s in sequences]
        return super().__call__(*sequences)


class ArithNCD(_NCDBase):
    """Arithmetic coding

    https://github.com/gw-c/arith
    http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251
    https://en.wikipedia.org/wiki/Arithmetic_coding
    """

    def __init__(self, base=2, terminator=None, qval=1):
        self.base = base
        self.terminator = terminator
        self.qval = qval

    def _make_probs(self, *sequences):
        """
        https://github.com/gw-c/arith/blob/master/arith.py
        """
        sequences = self._get_counters(*sequences)
        counts = self._sum_counters(*sequences)
        if self.terminator is not None:
            counts[self.terminator] = 1
        total_letters = sum(counts.values())

        prob_pairs = {}
        cumulative_count = 0
        counts = sorted(counts.items(), key=lambda x: (x[1], x[0]), reverse=True)
        for char, current_count in counts:
            prob_pairs[char] = (
                Fraction(cumulative_count, total_letters),
                Fraction(current_count, total_letters),
            )
            cumulative_count += current_count
        assert cumulative_count == total_letters
        return prob_pairs

    def _get_range(self, data, probs):
        if self.terminator is not None:
            if self.terminator in data:
                data = data.replace(self.terminator, '')
            data += self.terminator

        start = Fraction(0, 1)
        width = Fraction(1, 1)
        for char in data:
            prob_start, prob_width = probs[char]
            start += prob_start * width
            width *= prob_width
        return start, start + width

    def _compress(self, data):
        probs = self._make_probs(data)
        start, end = self._get_range(data=data, probs=probs)
        output_fraction = Fraction(0, 1)
        output_denominator = 1
        while not (start <= output_fraction < end):
            output_numerator = 1 + ((start.numerator * output_denominator) // start.denominator)
            output_fraction = Fraction(output_numerator, output_denominator)
            output_denominator *= 2
        return output_fraction

    def _get_size(self, data):
        numerator = self._compress(data).numerator
        if numerator == 0:
            return 0
        return math.ceil(math.log(numerator, self.base))


class RLENCD(_NCDBase):
    """Run-length encoding

    https://en.wikipedia.org/wiki/Run-length_encoding
    """

    def _compress(self, data):
        new_data = []
        for k, g in groupby(data):
            n = len(list(g))
            if n > 2:
                new_data.append(str(n) + k)
            elif n == 1:
                new_data.append(k)
            else:
                new_data.append(2 * k)
        return ''.join(new_data)


class BWTRLENCD(RLENCD):
    """
    https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
    https://en.wikipedia.org/wiki/Run-length_encoding
    """

    def __init__(self, terminator='\0'):
        self.terminator = terminator

    def _compress(self, data):
        if not data:
            data = self.terminator
        elif self.terminator not in data:
            data += self.terminator
            modified = sorted(data[i:] + data[:i] for i in range(len(data)))
            data = ''.join([subdata[-1] for subdata in modified])
        return super()._compress(data)


# -- NORMAL COMPRESSORS -- #


class SqrtNCD(_NCDBase):
    """Square Root based NCD

    Size of compressed data equals to sum of square roots of counts of every
    element in the input sequence.
    """

    def __init__(self, qval=1):
        self.qval = qval

    def _compress(self, data):
        return {element: math.sqrt(count) for element, count in Counter(data).items()}

    def _get_size(self, data):
        return sum(self._compress(data).values())
class EntropyNCD(_NCDBase):
    """Entropy based NCD

    Get the entropy of the input sequence as the size of compressed data.

    https://en.wikipedia.org/wiki/Entropy_(information_theory)
    https://en.wikipedia.org/wiki/Entropy_encoding
    """

    def __init__(self, qval=1, coef=1, base=2):
        self.qval = qval
        self.coef = coef
        self.base = base

    def _compress(self, data):
        total_count = len(data)
        entropy = 0.0
        for element_count in Counter(data).values():
            p = element_count / total_count
            entropy -= p * math.log(p, self.base)
        assert entropy >= 0
        return entropy

        # # redundancy:
        # unique_count = len(counter)
        # absolute_entropy = math.log(unique_count, 2) / unique_count
        # return absolute_entropy - entropy / unique_count

    def _get_size(self, data):
        return self.coef + self._compress(data)


# -- BINARY COMPRESSORS -- #


class BZ2NCD(_BinaryNCDBase):
    """
    https://en.wikipedia.org/wiki/Bzip2
    """

    def _compress(self, data):
        return codecs.encode(data, 'bz2_codec')[15:]


class LZMANCD(_BinaryNCDBase):
    """
    https://en.wikipedia.org/wiki/LZMA
    """

    def _compress(self, data):
        if not lzma:
            raise ImportError('Please, install the PylibLZMA module')
        return lzma.compress(data)[14:]


class ZLIBNCD(_BinaryNCDBase):
    """
    https://en.wikipedia.org/wiki/Zlib
    """

    def _compress(self, data):
        return codecs.encode(data, 'zlib_codec')[2:]


arith_ncd = ArithNCD()
bwtrle_ncd = BWTRLENCD()
bz2_ncd = BZ2NCD()
lzma_ncd = LZMANCD()
rle_ncd = RLENCD()
zlib_ncd = ZLIBNCD()
sqrt_ncd = SqrtNCD()
entropy_ncd = EntropyNCD()

textdistance-4.2.2/textdistance/algorithms/edit_based.py
# built-in
from collections import defaultdict
from itertools import zip_longest

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity

try:
    import numpy
except ImportError:
    numpy = None


__all__ = [
    'Hamming', 'MLIPNS',
    'Levenshtein', 'DamerauLevenshtein',
    'Jaro', 'JaroWinkler', 'StrCmp95',
    'NeedlemanWunsch', 'Gotoh', 'SmithWaterman',

    'hamming', 'mlipns',
    'levenshtein', 'damerau_levenshtein',
    'jaro', 'jaro_winkler', 'strcmp95',
    'needleman_wunsch', 'gotoh', 'smith_waterman',
]


class Hamming(_Base):
    """
    Compute the Hamming distance between the two or more sequences.
    The Hamming distance is the number of differing items in ordered sequences.

    https://en.wikipedia.org/wiki/Hamming_distance
    """

    def __init__(self, qval=1, test_func=None, truncate=False, external=True):
        self.qval = qval
        self.test_func = test_func or self._ident
        self.truncate = truncate
        self.external = external

    def __call__(self, *sequences):
        sequences = self._get_sequences(*sequences)
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        _zip = zip if self.truncate else zip_longest
        return sum([not self.test_func(*es) for es in _zip(*sequences)])
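# --- Editor's illustrative sketch (not part of the original archive) ---
# Hamming counts positionwise mismatches; by default (truncate=False) the
# unmatched tail of the longer sequence also counts, via zip_longest:
import textdistance

assert textdistance.Hamming()('karolin', 'kathrin') == 3
assert textdistance.Hamming()('text', 'textdistance') == 8  # 8 padded tail positions
assert textdistance.Hamming(truncate=True)('text', 'textdistance') == 0
# --- end sketch ---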
class Levenshtein(_Base):
    """
    Compute the absolute Levenshtein distance between the two sequences.
    The Levenshtein distance is the minimum number of edit operations
    necessary for transforming one sequence into the other. The edit
    operations allowed are:

        * deletion:     ABC -> BC, AC, AB
        * insertion:    ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..

    https://en.wikipedia.org/wiki/Levenshtein_distance
    TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
    """

    def __init__(self, qval=1, test_func=None, external=True):
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external

    def _recursive(self, s1, s2):
        # TODO: more than 2 sequences support
        if not s1 or not s2:
            return len(s1) + len(s2)

        if self.test_func(s1[-1], s2[-1]):
            return self(s1[:-1], s2[:-1])

        # deletion/insertion
        d = min(
            self(s1[:-1], s2),
            self(s1, s2[:-1]),
        )
        # substitution
        s = self(s1[:-1], s2[:-1])
        return min(d, s) + 1

    def _cycled(self, s1, s2):
        """
        source:
        https://github.com/jamesturk/jellyfish/blob/master/jellyfish/_jellyfish.py#L18
        """
        rows = len(s1) + 1
        cols = len(s2) + 1
        prev = None
        if numpy:
            cur = numpy.arange(cols)
        else:
            cur = range(cols)
        for r in range(1, rows):
            prev, cur = cur, [r] + [0] * (cols - 1)
            for c in range(1, cols):
                deletion = prev[c] + 1
                insertion = cur[c - 1] + 1
                dist = self.test_func(s1[r - 1], s2[c - 1])
                edit = prev[c - 1] + (not dist)
                cur[c] = min(edit, deletion, insertion)
        return cur[-1]

    def __call__(self, s1, s2):
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        return self._cycled(s1, s2)


class DamerauLevenshtein(_Base):
    """
    Compute the absolute Damerau-Levenshtein distance between the two
    sequences. The Damerau-Levenshtein distance is the minimum number of
    edit operations necessary for transforming one sequence into the other.
    The edit operations allowed are:

        * deletion:      ABC -> BC, AC, AB
        * insertion:     ABC -> ABCD, EABC, AEBC..
        * substitution:  ABC -> ABE, ADC, FBC..
        * transposition: ABC -> ACB, BAC

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    """

    def __init__(self, qval=1, test_func=None, external=True):
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external

    def _numpy(self, s1, s2):
        # TODO: doesn't pass tests, need improve
        d = numpy.zeros([len(s1) + 1, len(s2) + 1], dtype=int)

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i][-1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1][j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i][j] = min(
                    d[i - 1][j] + 1,            # deletion
                    d[i][j - 1] + 1,            # insertion
                    d[i - 1][j - 1] + cost,     # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                d[i][j] = min(
                    d[i][j],
                    d[i - 2][j - 2] + cost,
                )

        return d[len(s1) - 1][len(s2) - 1]

    def _pure_python(self, s1, s2):
        """
        https://www.guyrutenberg.com/2008/12/15/damerau-levenshtein-distance-in-python/
        """
        d = {}

        # matrix
        for i in range(-1, len(s1) + 1):
            d[i, -1] = i + 1
        for j in range(-1, len(s2) + 1):
            d[-1, j] = j + 1

        for i, cs1 in enumerate(s1):
            for j, cs2 in enumerate(s2):
                cost = int(not self.test_func(cs1, cs2))
                # ^ 0 if equal, 1 otherwise

                d[i, j] = min(
                    d[i - 1, j] + 1,            # deletion
                    d[i, j - 1] + 1,            # insertion
                    d[i - 1, j - 1] + cost,     # substitution
                )

                # transposition
                if not i or not j:
                    continue
                if not self.test_func(cs1, s2[j - 1]):
                    continue
                if not self.test_func(s1[i - 1], cs2):
                    continue
                d[i, j] = min(
                    d[i, j],
                    d[i - 2, j - 2] + cost,
                )

        return d[len(s1) - 1, len(s2) - 1]

    def __call__(self, s1, s2):
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        # if numpy:
        #     return self._numpy(s1, s2)
        # else:
        return self._pure_python(s1, s2)
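# --- Editor's illustrative sketch (not part of the original archive) ---
# The transposition operation is the only difference from plain Levenshtein:
import textdistance

assert textdistance.levenshtein('ab', 'ba') == 2          # two substitutions
assert textdistance.damerau_levenshtein('ab', 'ba') == 1  # one transposition
# --- end sketch ---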
class JaroWinkler(_BaseSimilarity):
    """
    Computes the Jaro-Winkler measure between two strings.
    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score, but share a prefix and thus are likely to match.

    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro.js
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
    """

    def __init__(self, long_tolerance=False, winklerize=True, qval=1, external=True):
        self.qval = qval
        self.long_tolerance = long_tolerance
        self.winklerize = winklerize
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, s1, s2, prefix_weight=0.1):
        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        s1_len = len(s1)
        s2_len = len(s2)

        if not s1_len or not s2_len:
            return 0.0

        min_len = min(s1_len, s2_len)
        search_range = max(s1_len, s2_len)
        search_range = (search_range // 2) - 1
        if search_range < 0:
            search_range = 0

        s1_flags = [False] * s1_len
        s2_flags = [False] * s2_len

        # looking only within search range, count & flag matched pairs
        common_chars = 0
        for i, s1_ch in enumerate(s1):
            low = max(0, i - search_range)
            hi = min(i + search_range, s2_len - 1)
            for j in range(low, hi + 1):
                if not s2_flags[j] and s2[j] == s1_ch:
                    s1_flags[i] = s2_flags[j] = True
                    common_chars += 1
                    break

        # short circuit if no characters match
        if not common_chars:
            return 0.0

        # count transpositions
        k = trans_count = 0
        for i, s1_f in enumerate(s1_flags):
            if s1_f:
                for j in range(k, s2_len):
                    if s2_flags[j]:
                        k = j + 1
                        break
                if s1[i] != s2[j]:
                    trans_count += 1
        trans_count //= 2

        # adjust for similarities in nonmatched characters
        weight = common_chars / s1_len + common_chars / s2_len
        weight += (common_chars - trans_count) / common_chars
        weight /= 3

        # stop to boost if strings are not similar
        if not self.winklerize:
            return weight
        if weight <= 0.7:
            return weight

        # winkler modification
        # adjust for up to first 4 chars in common
        j = min(min_len, 4)
        i = 0
        while i < j and s1[i] == s2[i] and s1[i]:
            i += 1
        if i:
            weight += i * prefix_weight * (1.0 - weight)

        # optionally adjust for long strings
        # after agreeing beginning chars, at least two or more must agree and
        # agreed characters must be > half of remaining characters
        if not self.long_tolerance or min_len <= 4:
            return weight
        if common_chars <= i + 1 or 2 * common_chars < min_len + i:
            return weight
        tmp = (common_chars - i - 1) / (s1_len + s2_len - i * 2 + 2)
        weight += (1.0 - weight) * tmp
        return weight


class Jaro(JaroWinkler):
    def __init__(self, long_tolerance=False, qval=1, external=True):
        super().__init__(
            long_tolerance=long_tolerance,
            winklerize=False,
            qval=qval,
            external=external,
        )
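# --- Editor's illustrative sketch (not part of the original archive) ---
# Winkler's modification only boosts already-similar pairs (Jaro weight
# above 0.7) that share a prefix, here the common 'pre':
import textdistance

assert textdistance.jaro_winkler('prefix', 'pretext') > textdistance.jaro('prefix', 'pretext')
# --- end sketch ---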
class NeedlemanWunsch(_BaseSimilarity):
    """
    Computes the Needleman-Wunsch measure between two strings.
    The Needleman-Wunsch measure generalizes the Levenshtein distance and
    considers global alignment between two strings. Specifically, it is
    computed by assigning a score to each alignment between the two input
    strings and choosing the score of the best alignment, that is, the
    maximal score. An alignment between two strings is a set of
    correspondences between their characters, allowing for gaps.

    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    """
    positive = False

    def __init__(self, gap_cost=1.0, sim_func=None, qval=1, external=True):
        self.qval = qval
        self.gap_cost = gap_cost
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences):
        return -max(map(len, sequences)) * self.gap_cost

    def maximum(self, *sequences):
        return max(map(len, sequences))

    def distance(self, *sequences):
        """Get distance between sequences
        """
        return -1 * self.similarity(*sequences)

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 0
        return (self.distance(*sequences) - minimum) / (maximum - minimum)

    def normalized_similarity(self, *sequences):
        """Get similarity from 0 to 1
        """
        minimum = self.minimum(*sequences)
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 1
        return (self.similarity(*sequences) - minimum) / (maximum * 2)

    def __call__(self, s1, s2):
        if not numpy:
            raise ImportError('Please, install numpy for Needleman-Wunsch measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        # DP initialization
        for i in range(len(s1) + 1):
            dist_mat[i, 0] = -(i * self.gap_cost)
        # DP initialization
        for j in range(len(s2) + 1):
            dist_mat[0, j] = -(j * self.gap_cost)
        # Needleman-Wunsch DP calculation
        for i, c1 in enumerate(s1, 1):
            for j, c2 in enumerate(s2, 1):
                match = dist_mat[i - 1, j - 1] + self.sim_func(c1, c2)
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]


class SmithWaterman(_BaseSimilarity):
    """
    Computes the Smith-Waterman measure between two strings.
    The Smith-Waterman algorithm performs local sequence alignment; that is,
    for determining similar regions between two strings. Instead of looking
    at the total sequence, the Smith-Waterman algorithm compares segments of
    all possible lengths and optimizes the similarity measure.

    https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/smith-waterman.js
    """

    def __init__(self, gap_cost=1.0, sim_func=None, qval=1, external=True):
        self.qval = qval
        self.gap_cost = gap_cost
        self.sim_func = sim_func or self._ident
        self.external = external

    def maximum(self, *sequences):
        return min(map(len, sequences))

    def __call__(self, s1, s2):
        if not numpy:
            raise ImportError('Please, install numpy for Smith-Waterman measure')

        s1, s2 = self._get_sequences(s1, s2)

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        dist_mat = numpy.zeros(
            (len(s1) + 1, len(s2) + 1),
            dtype=float,
        )
        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                # The score for substituting the letter a[i - 1] for b[j - 1].
                # Generally low for mismatch, high for match.
                match = dist_mat[i - 1, j - 1] + self.sim_func(sc1, sc2)
                # The scores for introducing extra letters in one of the
                # strings (or by symmetry, deleting them from the other).
                delete = dist_mat[i - 1, j] - self.gap_cost
                insert = dist_mat[i, j - 1] - self.gap_cost
                dist_mat[i, j] = max(0, match, delete, insert)
        return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1]
class Gotoh(NeedlemanWunsch):
    """Gotoh score

    Gotoh's algorithm is essentially Needleman-Wunsch with affine gap
    penalties:
    https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
    """

    def __init__(self, gap_open=1, gap_ext=0.4, sim_func=None, qval=1, external=True):
        self.qval = qval
        self.gap_open = gap_open
        self.gap_ext = gap_ext
        if sim_func:
            self.sim_func = sim_func
        else:
            self.sim_func = self._ident
        self.external = external

    def minimum(self, *sequences):
        return -min(map(len, sequences))

    def maximum(self, *sequences):
        return min(map(len, sequences))

    def __call__(self, s1, s2):
        if not numpy:
            raise ImportError('Please, install numpy for Gotoh measure')

        s1, s2 = self._get_sequences(s1, s2)

        # result = self.quick_answer(s1, s2)
        # if result is not None:
        #     return result * self.maximum(s1, s2)

        len_s1 = len(s1)
        len_s2 = len(s2)

        d_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        p_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)
        q_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=float)

        d_mat[0, 0] = 0
        p_mat[0, 0] = float('-inf')
        q_mat[0, 0] = float('-inf')
        for i in range(1, len_s1 + 1):
            d_mat[i, 0] = float('-inf')
            p_mat[i, 0] = -self.gap_open - self.gap_ext * (i - 1)
            q_mat[i, 0] = float('-inf')
            q_mat[i, 1] = -self.gap_open
        for j in range(1, len_s2 + 1):
            d_mat[0, j] = float('-inf')
            p_mat[0, j] = float('-inf')
            p_mat[1, j] = -self.gap_open
            q_mat[0, j] = -self.gap_open - self.gap_ext * (j - 1)

        for i, sc1 in enumerate(s1, start=1):
            for j, sc2 in enumerate(s2, start=1):
                sim_val = self.sim_func(sc1, sc2)
                d_mat[i, j] = max(
                    d_mat[i - 1, j - 1] + sim_val,
                    p_mat[i - 1, j - 1] + sim_val,
                    q_mat[i - 1, j - 1] + sim_val,
                )
                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self.gap_open,
                    p_mat[i - 1, j] - self.gap_ext,
                )
                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self.gap_open,
                    q_mat[i, j - 1] - self.gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return max(d_mat[i, j], p_mat[i, j], q_mat[i, j])
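# --- Editor's illustrative sketch (not part of the original archive) ---
# Gotoh scores alignments with affine gaps: opening a gap costs gap_open and
# each extension only gap_ext, so one long gap is cheaper than several short
# ones. A hedged usage sketch (numpy is required; the value is not asserted):
import textdistance

score = textdistance.Gotoh(gap_open=1, gap_ext=0.4)('AGACTAGTTAC', 'CGAGACGT')
# --- end sketch ---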
class StrCmp95(_BaseSimilarity):
    """strcmp95 similarity

    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    """
    sp_mx = (
        ('A', 'E'), ('A', 'I'), ('A', 'O'), ('A', 'U'), ('B', 'V'), ('E', 'I'),
        ('E', 'O'), ('E', 'U'), ('I', 'O'), ('I', 'U'), ('O', 'U'), ('I', 'Y'),
        ('E', 'Y'), ('C', 'G'), ('E', 'F'), ('W', 'U'), ('W', 'V'), ('X', 'K'),
        ('S', 'Z'), ('X', 'S'), ('Q', 'C'), ('U', 'V'), ('M', 'N'), ('L', 'I'),
        ('Q', 'O'), ('P', 'R'), ('I', 'J'), ('2', 'Z'), ('5', 'S'), ('8', 'B'),
        ('1', 'I'), ('1', 'L'), ('0', 'O'), ('0', 'Q'), ('C', 'K'), ('G', 'J'),
    )

    def __init__(self, long_strings=False, external=True):
        self.long_strings = long_strings
        self.external = external

    def maximum(self, *sequences):
        return 1

    @staticmethod
    def _in_range(char):
        return 0 < ord(char) < 91

    def __call__(self, s1, s2):
        s1 = s1.strip().upper()
        s2 = s2.strip().upper()

        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        len_s1 = len(s1)
        len_s2 = len(s2)

        adjwt = defaultdict(int)

        # Initialize the adjwt array on the first call to the function only.
        # The adjwt array is used to give partial credit for characters that
        # may be errors due to known phonetic or character recognition errors.
        # A typical example is to match the letter "O" with the number "0".
        for c1, c2 in self.sp_mx:
            adjwt[c1, c2] = 3
            adjwt[c2, c1] = 3

        if len_s1 > len_s2:
            search_range = len_s1
            minv = len_s2
        else:
            search_range = len_s2
            minv = len_s1

        # Blank out the flags
        s1_flag = [0] * search_range
        s2_flag = [0] * search_range
        search_range = max(0, search_range // 2 - 1)

        # Looking only within the search range, count and flag the matched pairs.
        num_com = 0
        yl1 = len_s2 - 1
        for i, sc1 in enumerate(s1):
            lowlim = max(i - search_range, 0)
            hilim = min(i + search_range, yl1)
            for j in range(lowlim, hilim + 1):
                if s2_flag[j] == 0 and s2[j] == sc1:
                    s2_flag[j] = 1
                    s1_flag[i] = 1
                    num_com += 1
                    break

        # If no characters are in common - return
        if num_com == 0:
            return 0.0

        # Count the number of transpositions
        k = n_trans = 0
        for i, sc1 in enumerate(s1):
            if not s1_flag[i]:
                continue
            for j in range(k, len_s2):
                if s2_flag[j] != 0:
                    k = j + 1
                    break
            if sc1 != s2[j]:
                n_trans += 1
        n_trans = n_trans // 2

        # Adjust for similarities in unmatched characters
        n_simi = 0
        if minv > num_com:
            for i in range(len_s1):
                if s1_flag[i] != 0:
                    continue
                if not self._in_range(s1[i]):
                    continue
                for j in range(len_s2):
                    if s2_flag[j] != 0:
                        continue
                    if not self._in_range(s2[j]):
                        continue
                    if (s1[i], s2[j]) not in adjwt:
                        continue
                    n_simi += adjwt[s1[i], s2[j]]
                    s2_flag[j] = 2
                    break
        num_sim = n_simi / 10.0 + num_com

        # Main weight computation
        weight = num_sim / len_s1 + num_sim / len_s2
        weight += (num_com - n_trans) / num_com
        weight = weight / 3.0

        # Continue to boost the weight if the strings are similar
        if weight <= 0.7:
            return weight

        # Adjust for having up to the first 4 characters in common
        j = min(minv, 4)
        i = 0
        for sc1, sc2 in zip(s1, s2):
            if i >= j:
                break
            if sc1 != sc2:
                break
            if sc1.isdigit():
                break
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # Optionally adjust for long strings.
        # After agreeing beginning chars, at least two more must agree and
        # the agreeing characters must be > .5 of the remaining characters.
        if not self.long_strings:
            return weight
        if minv <= 4:
            return weight
        if num_com <= i + 1 or 2 * num_com < minv + i:
            return weight
        if s1[0].isdigit():
            return weight
        res = (num_com - i - 1) / (len_s1 + len_s2 - i * 2 + 2)
        weight += (1.0 - weight) * res
        return weight
class MLIPNS(_BaseSimilarity):
    """
    Compute the MLIPNS similarity between two or more sequences.

    MLIPNS (Modified Language-Independent Product Name Search) is a
    Hamming-based measure: it returns 1 if the sequences are considered
    matching (the share of mismatched items stays within the allowed
    threshold) and 0 otherwise.

    http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mlipns.js
    """

    def __init__(self, threshold=0.25, maxmismatches=2, qval=1, external=True):
        self.qval = qval
        self.threshold = threshold
        self.maxmismatches = maxmismatches
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        sequences = self._get_sequences(*sequences)

        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        mismatches = 0
        ham = Hamming()(*sequences)
        maxlen = max(map(len, sequences))
        while all(sequences) and mismatches <= self.maxmismatches:
            if not maxlen:
                return 1
            if 1 - (maxlen - ham) / maxlen <= self.threshold:
                return 1
            mismatches += 1
            ham -= 1
            maxlen -= 1

        if not maxlen:
            return 1
        return 0


hamming = Hamming()
levenshtein = Levenshtein()
damerau = damerau_levenshtein = DamerauLevenshtein()
jaro = Jaro()
jaro_winkler = JaroWinkler()
needleman_wunsch = NeedlemanWunsch()
smith_waterman = SmithWaterman()
gotoh = Gotoh()
strcmp95 = StrCmp95()
mlipns = MLIPNS()

textdistance-4.2.2/textdistance/algorithms/phonetic.py
# built-in
from collections import defaultdict
from itertools import groupby

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity

try:
    from itertools import zip_longest
except ImportError:
    from itertools import izip_longest as zip_longest

try:
    import numpy
except ImportError:
    numpy = None


__all__ = [
    'MRA', 'Editex',
    'mra', 'editex',
]


class MRA(_BaseSimilarity):
    """Western Airlines Surname Match Rating Algorithm comparison rating

    https://en.wikipedia.org/wiki/Match_rating_approach
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mra.js
    """

    def maximum(self, *sequences):
        sequences = [list(self._calc_mra(s)) for s in sequences]
        return max(map(len, sequences))

    def _calc_mra(self, word):
        if not word:
            return word
        word = word.upper()
        word = word[0] + ''.join(c for c in word[1:] if c not in 'AEIOU')
        # collapse repeated characters, like UNIX uniq does
        word = ''.join(char for char, _ in groupby(word))
        if len(word) > 6:
            return word[:3] + word[-3:]
        return word

    def __call__(self, *sequences):
        if not all(sequences):
            return 0
        sequences = [list(self._calc_mra(s)) for s in sequences]
        lengths = list(map(len, sequences))
        count = len(lengths)
        max_length = max(lengths)
        if abs(max_length - min(lengths)) > count:
            return 0

        for _ in range(count):
            new_sequences = []
            minlen = min(lengths)
            for chars in zip(*sequences):
                if not self._ident(*chars):
                    new_sequences.append(chars)
            new_sequences = map(list, zip(*new_sequences))
            # update sequences
            ss = zip_longest(new_sequences, sequences, fillvalue=list())
            sequences = [s1 + s2[minlen:] for s1, s2 in ss]
            # update lengths
            lengths = list(map(len, sequences))

        if not lengths:
            return max_length
        return max_length - max(lengths)
class Editex(_Base):
    """
    https://anhaidgroup.github.io/py_stringmatching/v0.3.x/Editex.html
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.14.3856&rep=rep1&type=pdf
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.18.2138&rep=rep1&type=pdf
    https://github.com/chrislit/abydos/blob/master/abydos/distance/_editex.py
    https://habr.com/ru/post/331174/ (RUS)
    """
    groups = (
        frozenset('AEIOUY'),
        frozenset('BP'),
        frozenset('CKQ'),
        frozenset('DT'),
        frozenset('LR'),
        frozenset('MN'),
        frozenset('GJ'),
        frozenset('FPV'),
        frozenset('SXZ'),
        frozenset('CSZ'),
    )
    # all letters of the alphabet that are not present in `groups`
    ungrouped = frozenset('HW')

    def __init__(self, local=False, match_cost=0, group_cost=1, mismatch_cost=2,
                 groups=None, ungrouped=None, external=True):
        self.match_cost = match_cost
        self.group_cost = group_cost
        self.mismatch_cost = mismatch_cost
        self.local = local
        self.external = external

        if groups is not None:
            if ungrouped is None:
                raise ValueError('`ungrouped` argument required with `groups`')
            self.groups = groups
            self.ungrouped = ungrouped
        self.grouped = frozenset.union(*self.groups)

        # backward compat
        if hasattr(self, 'letter_groups'):
            self.groups = self.letter_groups

    def maximum(self, *sequences):
        return max(map(len, sequences)) * self.mismatch_cost

    def r_cost(self, *elements):
        if self._ident(*elements):
            return self.match_cost
        if any(map(lambda x: x not in self.grouped, elements)):
            return self.mismatch_cost
        for group in self.groups:
            if all(map(lambda x: x in group, elements)):
                return self.group_cost
        return self.mismatch_cost

    def d_cost(self, *elements):
        if not self._ident(*elements) and elements[0] in self.ungrouped:
            return self.group_cost
        return self.r_cost(*elements)

    def __call__(self, s1, s2):
        result = self.quick_answer(s1, s2)
        if result is not None:
            return result

        # must do `upper` before getting length because some one-char lowercase
        # glyphs are represented as two chars in uppercase.
        s1 = ' ' + s1.upper()
        s2 = ' ' + s2.upper()
        len_s1 = len(s1) - 1
        len_s2 = len(s2) - 1

        if numpy:
            d_mat = numpy.zeros((len_s1 + 1, len_s2 + 1), dtype=int)
        else:
            d_mat = defaultdict(lambda: defaultdict(int))

        if not self.local:
            for i in range(1, len_s1 + 1):
                d_mat[i][0] = d_mat[i - 1][0] + self.d_cost(s1[i - 1], s1[i])
        for j in range(1, len_s2 + 1):
            d_mat[0][j] = d_mat[0][j - 1] + self.d_cost(s2[j - 1], s2[j])

        for i, (cs1_prev, cs1_curr) in enumerate(zip(s1, s1[1:]), start=1):
            for j, (cs2_prev, cs2_curr) in enumerate(zip(s2, s2[1:]), start=1):
                d_mat[i][j] = min(
                    d_mat[i - 1][j] + self.d_cost(cs1_prev, cs1_curr),
                    d_mat[i][j - 1] + self.d_cost(cs2_prev, cs2_curr),
                    d_mat[i - 1][j - 1] + self.r_cost(cs1_curr, cs2_curr),
                )

        return d_mat[len_s1][len_s2]


mra = MRA()
editex = Editex()
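# --- Editor's illustrative sketch (not part of the original archive) ---
# Editex charges match_cost (0) for identical letters, group_cost (1) for
# letters from the same phonetic group, and mismatch_cost (2) otherwise,
# matching the expectations in tests/test_phonetic/test_editex.py:
import textdistance

assert textdistance.editex('cat', 'hat') == 2     # 'c'/'h' share no group
assert textdistance.editex('niall', 'neal') == 1  # 'i'/'e' are both vowels
# --- end sketch ---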
textdistance-4.2.2/textdistance/algorithms/sequence_based.py
# built-in
from difflib import SequenceMatcher as _SequenceMatcher

# app
from ..utils import find_ngrams
from .base import BaseSimilarity as _BaseSimilarity

try:
    import numpy
except ImportError:
    from array import array
    numpy = None


__all__ = [
    'lcsseq', 'lcsstr', 'ratcliff_obershelp',
    'LCSSeq', 'LCSStr', 'RatcliffObershelp',
]


class LCSSeq(_BaseSimilarity):
    """longest common subsequence similarity

    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
    """

    def __init__(self, qval=1, test_func=None, external=True):
        self.qval = qval
        self.test_func = test_func or self._ident
        self.external = external

    def _dynamic(self, seq1, seq2):
        """
        https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
        http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
        http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
        """
        if numpy:
            lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)
        else:
            lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]

        # row 0 and column 0 are initialized to 0 already
        for i, char1 in enumerate(seq1):
            for j, char2 in enumerate(seq2):
                if char1 == char2:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

        # read the substring out from the matrix
        result = ''
        i, j = len(seq1), len(seq2)
        while i != 0 and j != 0:
            if lengths[i][j] == lengths[i - 1][j]:
                i -= 1
            elif lengths[i][j] == lengths[i][j - 1]:
                j -= 1
            else:
                assert seq1[i - 1] == seq2[j - 1]
                result = seq1[i - 1] + result
                i -= 1
                j -= 1
        return result

    def _recursive(self, *sequences):
        if not all(sequences):
            return type(sequences[0])()  # empty sequence
        if self.test_func(*[s[-1] for s in sequences]):
            c = sequences[0][-1]
            sequences = [s[:-1] for s in sequences]
            return self(*sequences) + c
        m = type(sequences[0])()  # empty sequence
        for i, s in enumerate(sequences):
            ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]
            m = max([self(*ss), m], key=len)
        return m

    def __call__(self, *sequences):
        if not sequences:
            return ''
        sequences = self._get_sequences(*sequences)
        if len(sequences) == 2:
            return self._dynamic(*sequences)
        else:
            return self._recursive(*sequences)

    def similarity(self, *sequences):
        return len(self(*sequences))


class LCSStr(_BaseSimilarity):
    """longest common substring similarity
    """

    def _standard(self, s1, s2):
        matcher = _SequenceMatcher(a=s1, b=s2)
        match = matcher.find_longest_match(0, len(s1), 0, len(s2))
        return s1[match.a: match.a + match.size]

    def _custom(self, *sequences):
        short = min(sequences, key=len)
        length = len(short)
        for n in range(length, 0, -1):
            for subseq in find_ngrams(short, n):
                subseq = ''.join(subseq)
                for seq in sequences:
                    if subseq not in seq:
                        break
                else:
                    return subseq
        return type(short)()  # empty sequence

    def __call__(self, *sequences):
        if not all(sequences):
            return ''
        length = len(sequences)
        if length == 0:
            return ''
        if length == 1:
            return sequences[0]

        sequences = self._get_sequences(*sequences)
        if length == 2 and max(map(len, sequences)) < 200:
            return self._standard(*sequences)
        return self._custom(*sequences)

    def similarity(self, *sequences):
        return len(self(*sequences))
class RatcliffObershelp(_BaseSimilarity):
    """Ratcliff-Obershelp similarity

    This follows the Ratcliff-Obershelp algorithm to derive a similarity
    measure:

        1. Find the length of the longest common substring in sequences.
        2. Recurse on the strings to the left & right of this substring in
           sequences. The base case is a 0 length common substring, in which
           case, return 0. Otherwise, return the sum of the current longest
           common substring and the left & right recursed sums.
        3. Multiply this length by 2 and divide by the sum of the lengths of
           sequences.

    https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js
    https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html
    """

    def maximum(self, *sequences):
        return 1

    def _find(self, *sequences):
        subseq = LCSStr()(*sequences)
        length = len(subseq)
        if length == 0:
            return 0
        before = [s[:s.find(subseq)] for s in sequences]
        after = [s[s.find(subseq) + length:] for s in sequences]
        return self._find(*before) + length + self._find(*after)

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        scount = len(sequences)  # sequences count
        ecount = sum(map(len, sequences))  # elements count
        sequences = self._get_sequences(*sequences)
        return scount * self._find(*sequences) / ecount


lcsseq = LCSSeq()
lcsstr = LCSStr()
ratcliff_obershelp = RatcliffObershelp()

textdistance-4.2.2/textdistance/algorithms/simple.py
# built-in
from itertools import takewhile

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity


__all__ = [
    'Prefix', 'Postfix', 'Length', 'Identity', 'Matrix',
    'prefix', 'postfix', 'length', 'identity', 'matrix',
]


class Prefix(_BaseSimilarity):
    """prefix similarity
    """

    def __init__(self, qval=1, sim_test=None):
        self.qval = qval
        self.sim_test = sim_test or self._ident

    def __call__(self, *sequences):
        if not sequences:
            return 0
        sequences = self._get_sequences(*sequences)
        test = lambda seq: self.sim_test(*seq)  # noQA
        result = [c[0] for c in takewhile(test, zip(*sequences))]

        s = sequences[0]
        if isinstance(s, str):
            return ''.join(result)
        if isinstance(s, bytes):
            return b''.join(result)
        return result

    def similarity(self, *sequences):
        return len(self(*sequences))


class Postfix(Prefix):
    """postfix similarity
    """

    def __call__(self, *sequences):
        s = sequences[0]
        sequences = [reversed(s) for s in sequences]
        result = reversed(super().__call__(*sequences))
        if isinstance(s, str):
            return ''.join(result)
        if isinstance(s, bytes):
            return b''.join(result)
        return list(result)


class Length(_Base):
    """Length distance
    """

    def __call__(self, *sequences):
        lengths = list(map(len, sequences))
        return max(lengths) - min(lengths)


class Identity(_BaseSimilarity):
    """Identity similarity
    """

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        return int(self._ident(*sequences))


class Matrix(_BaseSimilarity):
    """Matrix similarity
    """

    def __init__(self, mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True):
        self.mat = mat
        self.mismatch_cost = mismatch_cost
        self.match_cost = match_cost
        self.symmetric = symmetric
        # self.alphabet = sum(mat.keys(), ())

    def maximum(self, *sequences):
        return self.match_cost

    def __call__(self, *sequences):
        if not self.mat:
            if self._ident(*sequences):
                return self.match_cost
            return self.mismatch_cost

        # search in matrix
        if sequences in self.mat:
            return self.mat[sequences]
        # search in symmetric matrix
        if self.symmetric:
            sequences = tuple(reversed(sequences))
            if sequences in self.mat:
                return self.mat[sequences]
        # if identity then return match_cost
        if self._ident(*sequences):
            return self.match_cost
        # not found
        return self.mismatch_cost


prefix = Prefix()
postfix = Postfix()
length = Length()
identity = Identity()
matrix = Matrix()
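# --- Editor's illustrative sketch (not part of the original archive) ---
# Matrix looks pairs up in an explicit substitution matrix (with a symmetric
# fallback), which is one way to supply a sim_func to NeedlemanWunsch or
# Gotoh. The matrix values here are made up for illustration:
import textdistance

mat = {('A', 'A'): 10, ('G', 'C'): -5}
sim = textdistance.Matrix(mat, mismatch_cost=-1)
assert sim('A', 'A') == 10
assert sim('C', 'G') == -5  # symmetric lookup of ('G', 'C')
assert sim('T', 'A') == -1  # fallback to mismatch_cost
# --- end sketch ---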
textdistance-4.2.2/textdistance/algorithms/token_based.py
# built-in
from functools import reduce
from itertools import islice, permutations, repeat
from math import log

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
from .edit_based import DamerauLevenshtein


__all__ = [
    'Jaccard', 'Sorensen', 'Tversky',
    'Overlap', 'Cosine', 'Tanimoto', 'MongeElkan', 'Bag',

    'jaccard', 'sorensen', 'tversky', 'sorensen_dice',
    'overlap', 'cosine', 'tanimoto', 'monge_elkan', 'bag',
]


class Jaccard(_BaseSimilarity):
    """
    Compute the Jaccard similarity between the two sequences.
    They should contain hashable items.
    The return value is a float between 0 and 1, where 1 means equal,
    and 0 totally different.

    https://en.wikipedia.org/wiki/Jaccard_index
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaccard.js
    """

    def __init__(self, qval=1, as_set=False, external=True):
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        sequences = self._get_counters(*sequences)              # sets
        intersection = self._intersect_counters(*sequences)     # set
        intersection = self._count_counters(intersection)       # int
        union = self._union_counters(*sequences)                # set
        union = self._count_counters(union)                     # int
        return intersection / union


class Sorensen(_BaseSimilarity):
    """
    Compute the Sorensen similarity between the two sequences.
    They should contain hashable items.
    The return value is a float between 0 and 1, where 1 means equal,
    and 0 totally different.

    https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/dice.js
    """

    def __init__(self, qval=1, as_set=False, external=True):
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        sequences = self._get_counters(*sequences)              # sets
        count = sum(self._count_counters(s) for s in sequences)
        intersection = self._intersect_counters(*sequences)     # set
        intersection = self._count_counters(intersection)       # int
        return 2.0 * intersection / count


class Tversky(_BaseSimilarity):
    """Tversky index

    https://en.wikipedia.org/wiki/Tversky_index
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/tversky.js
    """

    def __init__(self, qval=1, ks=None, bias=None, as_set=False, external=True):
        self.qval = qval
        self.ks = ks or repeat(1)
        self.bias = bias
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        sequences = self._get_counters(*sequences)              # sets
        intersection = self._intersect_counters(*sequences)     # set
        intersection = self._count_counters(intersection)       # int
        sequences = [self._count_counters(s) for s in sequences]  # ints
        ks = list(islice(self.ks, len(sequences)))

        if len(sequences) == 2 or self.bias is None:
            result = intersection
            for k, s in zip(ks, sequences):
                result += k * (s - intersection)
            return intersection / result

        s1, s2 = sequences
        alpha, beta = ks
        a_val = min([s1, s2])
        b_val = max([s1, s2])
        c_val = intersection + self.bias
        result = alpha * beta * (a_val - b_val) + b_val * beta
        return c_val / (result + c_val)
class Overlap(_BaseSimilarity):
    """overlap coefficient

    https://en.wikipedia.org/wiki/Overlap_coefficient
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/overlap.js
    """

    def __init__(self, qval=1, as_set=False, external=True):
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        sequences = self._get_counters(*sequences)              # sets
        intersection = self._intersect_counters(*sequences)     # set
        intersection = self._count_counters(intersection)       # int
        sequences = [self._count_counters(s) for s in sequences]  # ints

        return intersection / min(sequences)


class Cosine(_BaseSimilarity):
    """cosine similarity (Ochiai coefficient)

    https://en.wikipedia.org/wiki/Cosine_similarity
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/cosine.js
    """

    def __init__(self, qval=1, as_set=False, external=True):
        self.qval = qval
        self.as_set = as_set
        self.external = external

    def maximum(self, *sequences):
        return 1

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        sequences = self._get_counters(*sequences)              # sets
        intersection = self._intersect_counters(*sequences)     # set
        intersection = self._count_counters(intersection)       # int
        sequences = [self._count_counters(s) for s in sequences]  # ints
        prod = reduce(lambda x, y: x * y, sequences)

        return intersection / pow(prod, 1.0 / len(sequences))


class Tanimoto(Jaccard):
    """Tanimoto distance

    This is identical to the Jaccard similarity coefficient and the Tversky
    index for alpha=1 and beta=1.
    """

    def __call__(self, *sequences):
        result = super().__call__(*sequences)
        if result == 0:
            return float('-inf')
        else:
            return log(result, 2)


class MongeElkan(_BaseSimilarity):
    """
    https://www.academia.edu/200314/Generalized_Monge-Elkan_Method_for_Approximate_Text_String_Comparison
    http://www.cs.cmu.edu/~wcohen/postscript/kdd-2003-match-ws.pdf
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/monge-elkan.js
    """
    _damerau_levenshtein = DamerauLevenshtein()

    def __init__(self, algorithm=_damerau_levenshtein, symmetric=False, qval=1, external=True):
        self.algorithm = algorithm
        self.symmetric = symmetric
        self.qval = qval
        self.external = external

    def maximum(self, *sequences):
        result = self.algorithm.maximum(sequences)
        for seq in sequences:
            if seq:
                result = max(result, self.algorithm.maximum(*seq))
        return result

    def _calc(self, seq, *sequences):
        if not seq:
            return 0
        maxes = []
        for c1 in seq:
            for s in sequences:
                max_sim = float('-inf')
                for c2 in s:
                    max_sim = max(max_sim, self.algorithm.similarity(c1, c2))
                maxes.append(max_sim)
        return sum(maxes) / len(seq) / len(maxes)

    def __call__(self, *sequences):
        result = self.quick_answer(*sequences)
        if result is not None:
            return result

        sequences = self._get_sequences(*sequences)

        if self.symmetric:
            result = []
            for seqs in permutations(sequences):
                result.append(self._calc(*seqs))
            return sum(result) / len(result)
        else:
            return self._calc(*sequences)


class Bag(_Base):
    """Bag distance

    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/bag.js
    """

    def __call__(self, *sequences):
        sequences = self._get_counters(*sequences)              # sets
        intersection = self._intersect_counters(*sequences)     # set
        sequences = (self._count_counters(sequence - intersection) for sequence in sequences)
        # ^ ints
        return max(sequences)


bag = Bag()
cosine = Cosine()
dice = Sorensen()
jaccard = Jaccard()
monge_elkan = MongeElkan()
overlap = Overlap()
sorensen = Sorensen()
sorensen_dice = Sorensen()
# sorensen_dice = Tversky(ks=[.5, .5])
tanimoto = Tanimoto()
tversky = Tversky()
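# --- Editor's illustrative sketch (not part of the original archive) ---
# Tanimoto is log2 of the Jaccard coefficient, so identical sequences score
# 0.0 and everything else is negative:
from math import isclose, log2

import textdistance

assert textdistance.tanimoto('test', 'test') == 0.0
assert isclose(textdistance.tanimoto('test', 'text'), log2(3 / 5))
# --- end sketch ---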
textdistance-4.2.2/textdistance/algorithms/vector_based.py
"""
IMPORTANT: it's just a draft
"""
# built-in
from functools import reduce

# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity

try:
    import numpy
except ImportError:
    numpy = None


class Chebyshev(_Base):
    def _numpy(self, s1, s2):
        s1, s2 = numpy.asarray(s1), numpy.asarray(s2)
        return max(abs(s1 - s2))

    def _pure(self, s1, s2):
        return max(abs(e1 - e2) for e1, e2 in zip(s1, s2))

    def __call__(self, s1, s2):
        if numpy:
            return self._numpy(s1, s2)
        else:
            return self._pure(s1, s2)


class Minkowski(_Base):
    def __init__(self, p=1, weight=1):
        if p < 1:
            raise ValueError('p must be at least 1')
        self.p = p
        self.weight = weight

    def _numpy(self, s1, s2):
        s1, s2 = numpy.asarray(s1), numpy.asarray(s2)
        result = (self.weight * abs(s1 - s2)) ** self.p
        return result.sum() ** (1.0 / self.p)

    def _pure(self, s1, s2):
        result = (self.weight * abs(e1 - e2) for e1, e2 in zip(s1, s2))
        result = sum(e ** self.p for e in result)
        return result ** (1.0 / self.p)

    def __call__(self, s1, s2):
        if numpy:
            return self._numpy(s1, s2)
        else:
            return self._pure(s1, s2)


class Manhattan(_Base):
    def __call__(self, s1, s2):
        raise NotImplementedError


class Euclidean(_Base):
    def __init__(self, squared=False):
        self.squared = squared

    def _numpy(self, s1, s2):
        s1 = numpy.asarray(s1)
        s2 = numpy.asarray(s2)
        q = numpy.matrix(s1 - s2)
        result = (q * q.T).sum()
        if self.squared:
            return result
        return numpy.sqrt(result)

    def _pure(self, s1, s2):
        raise NotImplementedError

    def __call__(self, s1, s2):
        if numpy:
            return self._numpy(s1, s2)
        else:
            return self._pure(s1, s2)


class Mahalanobis(_Base):
    def __call__(self, s1, s2):
        raise NotImplementedError


class Correlation(_BaseSimilarity):
    def _numpy(self, *sequences):
        sequences = [numpy.asarray(s) for s in sequences]
        ssm = [s - s.mean() for s in sequences]
        result = reduce(numpy.dot, sequences)
        for sm in ssm:
            result /= numpy.sqrt(numpy.dot(sm, sm))
        return result

    def _pure(self, *sequences):
        raise NotImplementedError

    def __call__(self, *sequences):
        if numpy:
            return self._numpy(*sequences)
        else:
            return self._pure(*sequences)


class Kulsinski(_BaseSimilarity):
    def __call__(self, s1, s2):
        raise NotImplementedError
textdistance-4.2.2/textdistance/benchmark.py
# built-in
import json
from collections import defaultdict, namedtuple
from timeit import timeit

# external
from tabulate import tabulate

# app
from .libraries import LIBRARIES_FILE, prototype


# python3 -m textdistance.benchmark


libraries = prototype.clone()


Lib = namedtuple('Lib', ['algorithm', 'library', 'function', 'time', 'presets'])

EXTERNAL_SETUP = """
from {library} import {function} as func
presets = {presets}
if presets:
    func = func(presets)
"""

INTERNAL_SETUP = """
from textdistance import {} as cls
func = cls(external=False)
"""

STMT = """
func('text', 'test')
func('qwer', 'asdf')
func('a' * 15, 'b' * 15)
"""

RUNS = 2000


class Benchmark:
    @staticmethod
    def get_installed():
        for alg in libraries.get_algorithms():
            for lib in libraries.get_libs(alg):
                # try to load the function
                if not lib.get_function():
                    continue
                # return library info
                yield Lib(
                    algorithm=alg,
                    library=lib.module_name,
                    function=lib.func_name,
                    time=float('Inf'),
                    presets=lib.presets,
                )

    @staticmethod
    def get_external_benchmark(installed):
        for lib in installed:
            yield lib._replace(time=timeit(
                stmt=STMT,
                setup=EXTERNAL_SETUP.format(**lib._asdict()),
                number=RUNS,
            ))

    @staticmethod
    def get_internal_benchmark():
        for alg in libraries.get_algorithms():
            yield Lib(
                algorithm=alg,
                library='**textdistance**',
                function=alg,
                time=timeit(
                    stmt=STMT,
                    setup=INTERNAL_SETUP.format(alg),
                    number=RUNS,
                ),
                presets=None,
            )

    @staticmethod
    def filter_benchmark(external, internal):
        limits = {i.algorithm: i.time for i in internal}
        return filter(lambda x: x.time < limits[x.algorithm], external)

    @staticmethod
    def get_table(data):
        table = tabulate(
            [tuple(i[:-1]) for i in data],
            headers=['algorithm', 'library', 'function', 'time'],
            tablefmt='orgtbl',
        )
        table += '\nTotal: {} libs.\n\n'.format(len(data))
        return table

    @staticmethod
    def save(libs):
        data = defaultdict(list)
        for lib in libs:
            data[lib.algorithm].append([lib.library, lib.function])
        with open(LIBRARIES_FILE, 'w') as f:
            json.dump(obj=data, fp=f, indent=2, sort_keys=True)

    @classmethod
    def run(cls):
        print('# Installed libraries:\n')
        installed = list(cls.get_installed())
        installed.sort()
        print(cls.get_table(installed))

        print('# Benchmarks (with textdistance):\n')
        benchmark = list(cls.get_external_benchmark(installed))
        benchmark_internal = list(cls.get_internal_benchmark())
        benchmark += benchmark_internal
        benchmark.sort(key=lambda x: (x.algorithm, x.time))
        print(cls.get_table(benchmark))

        print('# Faster than textdistance:\n')
        benchmark = list(cls.filter_benchmark(benchmark, benchmark_internal))
        print(cls.get_table(benchmark))

        cls.save(benchmark)


if __name__ == '__main__':
    Benchmark.run()

textdistance-4.2.2/textdistance/libraries.json
{
  "DamerauLevenshtein": [
    [
      "jellyfish",
      "damerau_levenshtein_distance"
    ],
    [
      "pyxdameraulevenshtein",
      "damerau_levenshtein_distance"
    ],
    [
      "pylev",
      "damerau_levenshtein"
    ]
  ],
  "Hamming": [
    [
      "Levenshtein",
      "hamming"
    ],
    [
      "jellyfish",
      "hamming_distance"
    ],
    [
      "distance",
      "hamming"
    ],
    [
      "abydos.distance",
      "Hamming"
    ]
  ],
  "Jaro": [
    [
      "Levenshtein",
      "jaro"
    ],
    [
      "jellyfish",
      "jaro_similarity"
    ],
    [
      "py_stringmatching.similarity_measure.jaro",
      "jaro"
    ]
  ],
  "JaroWinkler": [
    [
      "jellyfish",
      "jaro_winkler_similarity"
    ]
  ],
  "Levenshtein": [
    [
      "Levenshtein",
      "distance"
    ],
    [
      "jellyfish",
      "levenshtein_distance"
    ],
    [
      "py_stringmatching.similarity_measure.levenshtein",
      "levenshtein"
    ],
    [
      "pylev",
      "levenshtein"
    ],
    [
      "distance",
      "levenshtein"
    ],
    [
      "abydos.distance",
      "Levenshtein"
    ]
  ]
}
""" return list(self.libs.keys()) def get_libs(self, alg): """Get libs list for algorithm """ if alg not in self.libs: return [] return self.libs[alg] def clone(self): """Clone library manager prototype """ obj = self.__class__() obj.libs = deepcopy(self.libs) return obj class LibraryBase: func = NotImplemented def __init__(self, module_name, func_name, attr=None, presets=None, conditions=None): self.module_name = module_name self.func_name = func_name self.presets = presets self.conditions = conditions self.attr = attr def check_conditions(self, obj, *sequences): # external libs can compare only 2 strings if len(sequences) != 2: return False if not self.conditions: return True for name, value in self.conditions.items(): if getattr(obj, name) != value: return False return True def prepare(self, *sequences): return sequences def get_function(self): if self.func is NotImplemented: # import module try: module = import_module(self.module_name) except ImportError: self.func = None return # get object from module if self.module_name == 'abydos.distance': # abydos now provides its functions as classes allowing for # various options; we stick with the defaults for our # object constructor - the distance metric method is # called dist_abs() (whereas dist() gives a normalised distance) obj = getattr(module, self.func_name)().dist_abs else: obj = getattr(module, self.func_name) # init class if self.presets is not None: obj = obj(**self.presets) # get needed attribute if self.attr: obj = getattr(obj, self.attr) self.func = obj return self.func def __str__(self): return '{}.{}'.format(self.module_name, self.func_name) class TextLibrary(LibraryBase): def check_conditions(self, obj, *sequences): if not super().check_conditions(obj, *sequences): return False # compare only by letters if getattr(obj, 'qval', 0) != 1: return False # every sequence must be string for seq in sequences: if type(seq) is not str: return False return True def prepare(self, *sequences): # convert list of letters to string if isinstance(sequences[0], (tuple, list)): sequences = list(map(lambda x: ''.join(x), sequences)) return sequences class SameLengthLibrary(LibraryBase): def check_conditions(self, obj, *sequences): if not super().check_conditions(obj, *sequences): return False # compare only same length iterators if min(map(len, sequences)) != max(map(len, sequences)): return False return True class SameLengthTextLibrary(SameLengthLibrary, TextLibrary): pass prototype = LibrariesManager() prototype.register('DamerauLevenshtein', LibraryBase('abydos.distance', 'DamerauLevenshtein')) prototype.register('DamerauLevenshtein', LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance')) prototype.register('DamerauLevenshtein', TextLibrary('jellyfish', 'damerau_levenshtein_distance')) prototype.register('Hamming', LibraryBase('abydos.distance', 'Hamming')) prototype.register('Hamming', SameLengthLibrary('distance', 'hamming')) prototype.register('Hamming', SameLengthTextLibrary('Levenshtein', 'hamming')) prototype.register('Hamming', TextLibrary('jellyfish', 'hamming_distance')) prototype.register('Jaro', TextLibrary('jellyfish', 'jaro_similarity')) # prototype.register('Jaro', TextLibrary('Levenshtein', 'jaro')) # prototype.register('Jaro', TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro')) # prototype.register('JaroWinkler', LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler')) prototype.register('JaroWinkler', TextLibrary('jellyfish', 'jaro_winkler_similarity', 
    conditions=dict(winklerize=True)))
# https://github.com/life4/textdistance/issues/39
# prototype.register('JaroWinkler', TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

prototype.register('Levenshtein', LibraryBase('abydos.distance', 'Levenshtein'))
prototype.register('Levenshtein', LibraryBase('distance', 'levenshtein'))
prototype.register('Levenshtein', LibraryBase('pylev', 'levenshtein'))
prototype.register('Levenshtein', TextLibrary('jellyfish', 'levenshtein_distance'))
prototype.register('Levenshtein', TextLibrary('Levenshtein', 'distance'))
# prototype.register('Levenshtein', TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))


textdistance-4.2.2/textdistance/utils.py

# built-in
from itertools import permutations, product


__all__ = ['words_combinations', 'find_ngrams']


def words_combinations(f, *texts):
    m = float('Inf')
    # split by words
    texts = [t.split() for t in texts]
    # permutations
    texts = [permutations(words) for words in texts]
    # combinations
    for subtexts in product(*texts):
        if f.equality:
            words_min_cnt = len(min(subtexts, key=len))
            subtexts = [t[:words_min_cnt] for t in subtexts]
        subtexts = [' '.join(t) for t in subtexts]
        m = min(m, f(*subtexts))
    return m


def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))
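

# A minimal usage sketch for find_ngrams (illustrative only; not part of the
# library's shipped examples or test suite):
if __name__ == '__main__':
    # character bigrams of 'test'
    assert find_ngrams(list('test'), 2) == [('t', 'e'), ('e', 's'), ('s', 't')]
    # the same helper produces word-level n-grams from any sequence
    assert find_ngrams('the quick brown fox'.split(), 3) == [
        ('the', 'quick', 'brown'),
        ('quick', 'brown', 'fox'),
    ]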