pax_global_header00006660000000000000000000000064150414123660014514gustar00rootroot0000000000000052 comment=9abfb9b6d86e2f88213be06ee4da4c2c4f18cf38 normality-3.0.1/000077500000000000000000000000001504141236600135335ustar00rootroot00000000000000normality-3.0.1/.bumpversion.cfg000066400000000000000000000005031504141236600166410ustar00rootroot00000000000000[bumpversion] current_version = 3.0.1 tag_name = {new_version} commit = True tag = True [bumpversion:file:normality/__init__.py] search = __version__ = "{current_version}" replace = __version__ = "{new_version}" [bumpversion:file:pyproject.toml] search = version = "{current_version}" replace = version = "{new_version}" normality-3.0.1/.github/000077500000000000000000000000001504141236600150735ustar00rootroot00000000000000normality-3.0.1/.github/dependabot.yml000066400000000000000000000002201504141236600177150ustar00rootroot00000000000000version: 2 updates: - package-ecosystem: pip directory: "/" schedule: interval: daily time: "04:00" open-pull-requests-limit: 100 normality-3.0.1/.github/workflows/000077500000000000000000000000001504141236600171305ustar00rootroot00000000000000normality-3.0.1/.github/workflows/build.yml000066400000000000000000000020041504141236600207460ustar00rootroot00000000000000name: build on: [push] permissions: id-token: write jobs: python: runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 - name: Show ref run: | echo "$GITHUB_REF" - name: Set up Python uses: actions/setup-python@v1 with: python-version: '3.12' - name: Install dependencies env: DEBIAN_FRONTEND: noninteractive run: | sudo apt-get update -y -qq sudo apt-get install -y -qq libicu-dev pip install --upgrade pip wheel build twine hatchling pip install -e ".[dev]" - name: Validate mypy typing run: | make typecheck - name: Run unit tests run: | make test - name: Build a distribution run: | python3 -m build --wheel - name: Publish a Python distribution to PyPI if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1 with: skip-existing: true verbose: true normality-3.0.1/.gitignore000066400000000000000000000001051504141236600155170ustar00rootroot00000000000000*.egg-info *.pyc dist/* build/* .vscode/* .pytest_cache .mypy_cache/*normality-3.0.1/LICENSE000066400000000000000000000020741504141236600145430ustar00rootroot00000000000000Copyright (c) 2013-2025, Friedrich Lindenberg, Gregor Aisch Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
normality-3.0.1/MANIFEST.in000066400000000000000000000000741504141236600152720ustar00rootroot00000000000000include LICENSE include README.md include normality/py.typednormality-3.0.1/Makefile000066400000000000000000000005331504141236600151740ustar00rootroot00000000000000 all: clean test install: pip install -q '.[dev]' check: test typecheck test: pytest typecheck: mypy --strict normality clean: rm -rf dist build .eggs .mypy_cache .pytest_cache find . -name '*.egg-info' -exec rm -fr {} + find . -name '*.egg' -exec rm -f {} + find . -name '*.pyc' -exec rm -f {} + find . -name '*.pyo' -exec rm -f {} +normality-3.0.1/README.md000066400000000000000000000022431504141236600150130ustar00rootroot00000000000000# normality [![build](https://github.com/pudo/normality/actions/workflows/build.yml/badge.svg)](https://github.com/pudo/normality/actions/workflows/build.yml) Normality is a Python micro-package that contains a small set of text normalization functions for easier re-use. These functions accept a snippet of unicode or utf-8 encoded text and remove various classes of characters, such as diacritics, punctuation etc. This is useful as a preparation to further text analysis. **WARNING**: As of version 3.0, `normality` requires `pyicu` as a mandatory dependency. If you cannot install `pyicu`, consider using `normality < 3.0.0`. ## Example ```python # coding: utf-8 from normality import normalize, slugify, collapse_spaces, ascii_text, latinize_text text = normalize('Nie wieder "Grüne Süppchen" kochen!') assert text == 'nie wieder grune suppchen kochen' slug = slugify('My first blog post!') assert slug == 'my-first-blog-post' text = 'this \n\n\r\nhas\tlots of \nodd spacing.' assert collapse_spaces(text) == 'this has lots of odd spacing.' ``` ## License ``normality`` is open source, licensed under a standard MIT license (included in this repository as ``LICENSE``). normality-3.0.1/normality/000077500000000000000000000000001504141236600155515ustar00rootroot00000000000000normality-3.0.1/normality/__init__.py000066400000000000000000000063751504141236600176750ustar00rootroot00000000000000"""Helper functions for string cleaning. `normality` includes functions to convert arbitrary Python objects to strings, transliterate them into the latin alphabet, make slugs for URLs, or perform the substitution of characters based on unicode character categories. 
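The doctest-style examples below mirror the README and the test suite; they assume the
mandatory `pyicu` dependency is installed:

    >>> from normality import normalize, slugify, ascii_text
    >>> normalize('Nie wieder "Grüne Süppchen" kochen!')
    'nie wieder grune suppchen kochen'
    >>> slugify('My first blog post!')
    'my-first-blog-post'
    >>> ascii_text('Порошенко Петро Олексійович')
    'Porosenko Petro Oleksijovic'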
""" from typing import Any, Optional from normality.cleaning import collapse_spaces, squash_spaces, category_replace from normality.constants import UNICODE_CATEGORIES, WS from normality.transliteration import latinize_text, ascii_text from normality.encoding import guess_encoding, guess_file_encoding from normality.encoding import predict_encoding, predict_file_encoding from normality.encoding import DEFAULT_ENCODING from normality.stringify import stringify from normality.paths import safe_filename from normality.slugify import slugify, slugify_text from normality.util import Categories, Encoding __version__ = "3.0.1" __all__ = [ "collapse_spaces", "squash_spaces", "category_replace", "safe_filename", "normalize", "stringify", "slugify", "slugify_text", "guess_encoding", "guess_file_encoding", "predict_encoding", "predict_file_encoding", "latinize_text", "ascii_text", "WS", "UNICODE_CATEGORIES", "DEFAULT_ENCODING", ] def normalize( value: Any, lowercase: bool = True, collapse: bool = True, latinize: bool = False, ascii: bool = False, encoding_default: Encoding = DEFAULT_ENCODING, encoding: Optional[str] = None, replace_categories: Categories = UNICODE_CATEGORIES, ) -> Optional[str]: """The main normalization function for text. This will take a string and apply a set of transformations to it so that it can be processed more easily afterwards. Arguments: * ``lowercase``: not very mysterious. * ``collapse``: replace multiple whitespace-like characters with a single whitespace. This is especially useful with category replacement which can lead to a lot of whitespace. * ``decompose``: apply a unicode normalization (NFKD) to separate simple characters and their diacritics. * ``replace_categories``: This will perform a replacement of whole classes of unicode characters (e.g. symbols, marks, numbers) with a given character. It is used to replace any non-text elements of the input string. """ text = stringify(value, encoding_default=encoding_default, encoding=encoding) if text is None: return None if lowercase: # Yeah I made a Python package for this. text = text.lower() if ascii: # A stricter form of transliteration that leaves only ASCII # characters. text = ascii_text(text) elif latinize: # Perform unicode-based transliteration, e.g. of cyricllic # or CJK scripts into latin. text = latinize_text(text) # Perform unicode category-based character replacement. This is # used to filter out whole classes of characters, such as symbols, # punctuation, or whitespace-like characters. if replace_categories is not None: text = category_replace(text, replace_categories) if collapse: # Remove consecutive whitespace and strip text = squash_spaces(text) if len(text) == 0: return None return text normality-3.0.1/normality/cleaning.py000066400000000000000000000071641504141236600177130ustar00rootroot00000000000000import re import unicodedata from typing import Any, Optional import warnings from normality.constants import UNICODE_CATEGORIES, CONTROL_CODES, WS from normality.util import Categories, is_text COLLAPSE_RE = re.compile(r"[\s\u2028\u2029\u200b\u200c\u200d]+", re.U) BOM_RE = re.compile("^\ufeff", re.U) UNSAFE_RE = re.compile( r"^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f\u2028\u2029\u200b\u200c\u200d]" ) QUOTES_RE = re.compile(r'^["\'](.*)["\']$') def decompose_nfkd(text: Any) -> Optional[str]: """Perform unicode compatibility decomposition. 
This will replace some non-standard value representations in unicode and normalise them, while also separating characters and their diacritics into two separate codepoints. """ if not is_text(text): return None return unicodedata.normalize("NFKD", text) def compose_nfc(text: Any) -> Optional[str]: """Perform unicode composition.""" if not is_text(text): return None return unicodedata.normalize("NFC", text) def compose_nfkc(text: Any) -> Optional[str]: """Perform unicode composition.""" if not is_text(text): return None return unicodedata.normalize("NFKC", text) def strip_quotes(text: str) -> Optional[str]: """Remove double or single quotes surrounding a string.""" if not is_text(text): warnings.warn( "normality.strip_quotes will stop handling None soon.", DeprecationWarning, stacklevel=2, ) return None return QUOTES_RE.sub("\\1", text) def category_replace(text: str, replacements: Categories = UNICODE_CATEGORIES) -> str: """Remove characters from a string based on unicode classes. This is a method for removing non-text characters (such as punctuation, whitespace, marks and diacritics) from a piece of text by class, rather than specifying them individually. """ text = unicodedata.normalize("NFKD", text) characters = [] for character in text: cat = unicodedata.category(character) replacement = replacements.get(cat, character) if replacement is not None: characters.append(replacement) return "".join(characters) def remove_control_chars(text: str) -> str: """Remove just the control codes from a piece of text.""" return category_replace(text, replacements=CONTROL_CODES) def remove_unsafe_chars(text: str) -> str: """Remove unsafe unicode characters from a piece of text.""" if text is None: warnings.warn( "normality.remove_unsafe_chars will stop handling None soon.", DeprecationWarning, stacklevel=2, ) return "" return UNSAFE_RE.sub("", text) def remove_byte_order_mark(text: str) -> str: """Remove a BOM from the beginning of the text.""" if text is None: warnings.warn( "normality.remove_byte_order_mark will stop handling None soon.", DeprecationWarning, stacklevel=2, ) return "" return BOM_RE.sub("", text) def collapse_spaces(text: str) -> Optional[str]: """Remove newlines, tabs and multiple spaces with single spaces.""" warnings.warn( "normality.collapse_spaces is deprecated, use normality.squash_spaces instead.", DeprecationWarning, stacklevel=2, ) # TODO: Remove in 3.1: if text is None: return None text = COLLAPSE_RE.sub(WS, text).strip(WS) if len(text) == 0: return None return text def squash_spaces(text: str) -> str: """Remove all whitespace characters from a piece of text.""" return COLLAPSE_RE.sub(WS, text).strip(WS) normality-3.0.1/normality/constants.py000066400000000000000000000031271504141236600201420ustar00rootroot00000000000000from normality.util import Categories # https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode # Cyrillic: U+0400–U+04FF, 256 characters # Cyrillic Supplement: U+0500–U+052F, 48 characters # Cyrillic Extended-A: U+2DE0–U+2DFF, 32 characters # Cyrillic Extended-B: U+A640–U+A69F, 96 characters # Cyrillic Extended-C: U+1C80–U+1C8F, 9 characters # Phonetic Extensions: U+1D2B, U+1D78, 2 Cyrillic characters # Combining Half Marks: U+FE2E–U+FE2F, 2 Cyrillic characters WS: str = " " # Unicode character classes, see: # http://www.fileformat.info/info/unicode/category/index.htm # https://en.wikipedia.org/wiki/Unicode_character_property # http://www.unicode.org/charts/beta/script/ UNICODE_CATEGORIES: Categories = { "Cc": WS, "Cf": None, "Cs": None, "Co": None, "Cn": 
None, "Lm": None, "Mn": None, "Mc": WS, "Me": None, "No": None, "Zs": WS, "Zl": WS, "Zp": WS, "Pc": WS, # TODO: figure out if this wants to be None "Pd": WS, "Ps": WS, "Pe": WS, "Pi": WS, "Pf": WS, "Po": WS, "Sm": WS, "Sc": None, "Sk": None, "So": WS, } SLUG_CATEGORIES: Categories = { "Cc": None, "Cf": None, "Cs": None, "Co": None, "Cn": None, # "Lm": None, # "Mn": None, "Mc": WS, "Me": None, "No": None, "Zs": WS, "Zl": WS, "Zp": WS, "Pc": WS, "Pd": WS, "Ps": WS, "Pe": WS, "Pi": WS, "Pf": WS, "Po": WS, "Sm": WS, "Sc": None, "Sk": None, "So": WS, } CONTROL_CODES: Categories = {"Cc": WS, "Cf": WS, "Cs": WS, "Co": WS, "Cn": WS, "Zl": WS} normality-3.0.1/normality/encoding.py000066400000000000000000000072551504141236600177220ustar00rootroot00000000000000import codecs import chardet import warnings from charset_normalizer import from_bytes, CharsetMatches from typing import Any, BinaryIO, TYPE_CHECKING from normality.util import Encoding if TYPE_CHECKING: from charset_normalizer import CharsetMatches DEFAULT_ENCODING = "utf-8" def normalize_encoding(encoding: str, default: Encoding = DEFAULT_ENCODING) -> str: """Normalize the encoding name, replace ASCII w/ UTF-8.""" warnings.warn( "normalize_encoding is now deprecated. Use tidy_encoding instead", DeprecationWarning, ) return tidy_encoding(encoding, default) def tidy_encoding(encoding: str, default: Encoding = DEFAULT_ENCODING) -> str: """Normalize the encoding name, replace ASCII w/ UTF-8.""" if encoding is None: return default encoding = encoding.strip() if encoding.lower() in ["", "ascii"]: return default try: codec = codecs.lookup(encoding) return codec.name except LookupError: return default def normalize_result( result: Any, default: Encoding, threshold: float = 0.2 ) -> Encoding: """Interpret a chardet result.""" warnings.warn( "normalize_result is now deprecated. Use tidy_result instead", DeprecationWarning, ) if result is None: return default confidence: float = result.get("confidence") if confidence is None: return default if float(confidence) < threshold: return default encoding: Encoding = result.get("encoding") if encoding is None: return default return normalize_encoding(encoding, default=default) def tidy_result(result: CharsetMatches, default: Encoding) -> Encoding: """Interpret a chardet result.""" res = result.best() if res is None: return default encoding: Encoding = res.encoding if encoding is None: return default return tidy_encoding(encoding, default=default) def guess_encoding(text: bytes, default: Encoding = DEFAULT_ENCODING) -> Encoding: """Guess string encoding. Given a piece of text, apply character encoding detection to guess the appropriate encoding of the text. """ warnings.warn( "guess_encoding is now deprecated. Use predict_encoding instead", DeprecationWarning, ) return predict_encoding(text, default=default) def predict_encoding(text: bytes, default: Encoding = DEFAULT_ENCODING) -> Encoding: """Guess string encoding. Given a piece of text, apply character encoding detection to guess the appropriate encoding of the text. """ result = from_bytes(text, explain=False) return tidy_result(result, default=default) def guess_file_encoding(fh: BinaryIO, default: Encoding = DEFAULT_ENCODING) -> Encoding: """Guess encoding from a file handle.""" warnings.warn( "guess_encoding is now deprecated. 
Use predict_encoding instead", DeprecationWarning, ) start = fh.tell() detector = chardet.UniversalDetector() while True: data = fh.read(1024 * 10) if not data: detector.close() break detector.feed(data) if detector.done: break fh.seek(start) return normalize_result(detector.result, default=default) def predict_file_encoding( fh: BinaryIO, default: Encoding = DEFAULT_ENCODING ) -> Encoding: """Guess encoding from a file handle.""" start = fh.tell() result: CharsetMatches = CharsetMatches() while True: data = fh.read(1024 * 10) if not data: break result = from_bytes(data, explain=False) if result: break fh.seek(start) return tidy_result(result, default=default) normality-3.0.1/normality/paths.py000066400000000000000000000031171504141236600172440ustar00rootroot00000000000000import os from typing import Optional from banal import decode_path from normality.stringify import stringify from normality.cleaning import squash_spaces, category_replace from normality.constants import UNICODE_CATEGORIES, WS from normality.transliteration import ascii_text MAX_LENGTH = 254 def _safe_name(file_name: Optional[str], sep: str) -> Optional[str]: """Convert the file name to ASCII and normalize the string.""" file_name = stringify(file_name) if file_name is None: return None file_name = ascii_text(file_name) file_name = category_replace(file_name, UNICODE_CATEGORIES) file_name = squash_spaces(file_name) file_name = file_name.replace(WS, sep).strip(sep) if len(file_name) == 0: return None return file_name def safe_filename( file_name: Optional[str], sep: str = "_", default: Optional[str] = None, extension: Optional[str] = None, ) -> Optional[str]: """Create a secure filename for plain file system storage.""" if file_name is None: return decode_path(default) file_name = decode_path(file_name) if file_name is None: return None file_name = os.path.basename(file_name) file_name, _extension = os.path.splitext(file_name) file_name = _safe_name(file_name, sep=sep) if file_name is None: return decode_path(default) file_name = file_name[:MAX_LENGTH] extension = _safe_name(extension or _extension, sep=sep) if extension is not None: file_name = ".".join((file_name, extension)) file_name = file_name[:MAX_LENGTH] return file_name normality-3.0.1/normality/py.typed000066400000000000000000000000001504141236600172360ustar00rootroot00000000000000normality-3.0.1/normality/scripts.py000066400000000000000000000511141504141236600176140ustar00rootroot00000000000000from typing import Tuple from functools import lru_cache ALPHABET = 1 LATIN = 2 CYRILLIC = 3 GREEK = 4 ARABIC = 5 CJK = 6 HANGUL = 7 ABJAD = 99 ABUGIDA = 100 SYLLABARY = 101 HISTORIC = 99999 FUNKY = 100000 UNKNOWN = 0 # Source: https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt UNICODE_BLOCKS: Tuple[Tuple[int, int, str, Tuple[int, ...]]] = ( # type: ignore ( 0x0000, 0x007F, "Basic Latin", ( ALPHABET, LATIN, ), ), ( 0x0080, 0x00FF, "Latin-1 Supplement", ( ALPHABET, LATIN, ), ), ( 0x0100, 0x017F, "Latin Extended-A", ( ALPHABET, LATIN, ), ), ( 0x0180, 0x024F, "Latin Extended-B", ( ALPHABET, LATIN, ), ), (0x0250, 0x02AF, "IPA Extensions", ()), (0x02B0, 0x02FF, "Spacing Modifier Letters", ()), (0x0300, 0x036F, "Combining Diacritical Marks", ()), ( 0x0370, 0x03FF, "Greek and Coptic", ( ALPHABET, GREEK, ), ), ( 0x0400, 0x04FF, "Cyrillic", ( ALPHABET, CYRILLIC, ), ), ( 0x0500, 0x052F, "Cyrillic Supplement", ( ALPHABET, CYRILLIC, ), ), (0x0530, 0x058F, "Armenian", (ALPHABET,)), (0x0590, 0x05FF, "Hebrew", (ABJAD,)), ( 0x0600, 0x06FF, "Arabic", ( ARABIC, ABJAD, ), ), (0x0700, 
0x074F, "Syriac", (ABJAD,)), ( 0x0750, 0x077F, "Arabic Supplement", ( ARABIC, ABJAD, ), ), (0x0780, 0x07BF, "Thaana", (ABUGIDA,)), (0x07C0, 0x07FF, "NKo", (FUNKY,)), (0x0800, 0x083F, "Samaritan", (ABJAD,)), ( 0x0840, 0x085F, "Mandaic", ( ALPHABET, HISTORIC, ), ), (0x0860, 0x086F, "Syriac Supplement", (ABJAD,)), ( 0x0870, 0x089F, "Arabic Extended-B", ( ARABIC, ABJAD, ), ), ( 0x08A0, 0x08FF, "Arabic Extended-A", ( ARABIC, ABJAD, ), ), (0x0900, 0x097F, "Devanagari", (ABUGIDA,)), (0x0980, 0x09FF, "Bengali", (ABUGIDA,)), (0x0A00, 0x0A7F, "Gurmukhi", (ABUGIDA,)), (0x0A80, 0x0AFF, "Gujarati", (ABUGIDA,)), ( 0x0B00, 0x0B7F, "Oriya", ( ABUGIDA, HISTORIC, ), ), (0x0B80, 0x0BFF, "Tamil", (ABUGIDA,)), (0x0C00, 0x0C7F, "Telugu", (ABUGIDA,)), (0x0C80, 0x0CFF, "Kannada", (ABUGIDA,)), (0x0D00, 0x0D7F, "Malayalam", (ABUGIDA,)), (0x0D80, 0x0DFF, "Sinhala", (ABUGIDA,)), (0x0E00, 0x0E7F, "Thai", (ABUGIDA,)), (0x0E80, 0x0EFF, "Lao", (ABUGIDA,)), (0x0F00, 0x0FFF, "Tibetan", (ABUGIDA,)), (0x1000, 0x109F, "Myanmar", (ABUGIDA,)), (0x10A0, 0x10FF, "Georgian", (ALPHABET,)), ( 0x1100, 0x11FF, "Hangul Jamo", ( ALPHABET, HANGUL, ), ), (0x1200, 0x137F, "Ethiopic", (ABUGIDA,)), (0x1380, 0x139F, "Ethiopic Supplement", (ABUGIDA,)), (0x13A0, 0x13FF, "Cherokee", (SYLLABARY,)), (0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics", (SYLLABARY,)), ( 0x1680, 0x169F, "Ogham", ( ALPHABET, HISTORIC, ), ), ( 0x16A0, 0x16FF, "Runic", ( ALPHABET, HISTORIC, ), ), (0x1700, 0x171F, "Tagalog", (ABUGIDA,)), (0x1720, 0x173F, "Hanunoo", (ABUGIDA,)), (0x1740, 0x175F, "Buhid", (ABUGIDA,)), (0x1760, 0x177F, "Tagbanwa", (ABUGIDA,)), (0x1780, 0x17FF, "Khmer", (ABUGIDA,)), ( 0x1800, 0x18AF, "Mongolian", ( ALPHABET, FUNKY, ), ), (0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended", (SYLLABARY,)), (0x1900, 0x194F, "Limbu", (ABUGIDA,)), (0x1950, 0x197F, "Tai Le", (ABUGIDA,)), (0x1980, 0x19DF, "New Tai Lue", (ABUGIDA,)), (0x19E0, 0x19FF, "Khmer Symbols", (ABUGIDA,)), (0x1A00, 0x1A1F, "Buginese", (ABUGIDA,)), (0x1A20, 0x1AAF, "Tai Tham", (ABUGIDA,)), (0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended", ()), (0x1B00, 0x1B7F, "Balinese", (ABUGIDA,)), (0x1B80, 0x1BBF, "Sundanese", (ABUGIDA,)), (0x1BC0, 0x1BFF, "Batak", (ABUGIDA,)), (0x1C00, 0x1C4F, "Lepcha", (ABUGIDA,)), ( 0x1C50, 0x1C7F, "Ol Chiki", ( ALPHABET, FUNKY, ), ), ( 0x1C80, 0x1C8F, "Cyrillic Extended-C", ( ALPHABET, CYRILLIC, ), ), (0x1C90, 0x1CBF, "Georgian Extended", (ALPHABET,)), (0x1CC0, 0x1CCF, "Sundanese Supplement", (ABUGIDA,)), (0x1CD0, 0x1CFF, "Vedic Extensions", ()), (0x1D00, 0x1D7F, "Phonetic Extensions", ()), (0x1D80, 0x1DBF, "Phonetic Extensions Supplement", ()), (0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement", ()), ( 0x1E00, 0x1EFF, "Latin Extended Additional", ( ALPHABET, LATIN, ), ), ( 0x1F00, 0x1FFF, "Greek Extended", ( ALPHABET, GREEK, ), ), (0x2000, 0x206F, "General Punctuation", ()), (0x2070, 0x209F, "Superscripts and Subscripts", ()), (0x20A0, 0x20CF, "Currency Symbols", ()), (0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols", ()), (0x2100, 0x214F, "Letterlike Symbols", ()), (0x2150, 0x218F, "Number Forms", ()), (0x2190, 0x21FF, "Arrows", ()), (0x2200, 0x22FF, "Mathematical Operators", ()), (0x2300, 0x23FF, "Miscellaneous Technical", ()), (0x2400, 0x243F, "Control Pictures", ()), (0x2440, 0x245F, "Optical Character Recognition", ()), (0x2460, 0x24FF, "Enclosed Alphanumerics", ()), (0x2500, 0x257F, "Box Drawing", ()), (0x2580, 0x259F, "Block Elements", ()), (0x25A0, 0x25FF, "Geometric Shapes", ()), (0x2600, 0x26FF, "Miscellaneous 
Symbols", ()), (0x2700, 0x27BF, "Dingbats", ()), (0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A", ()), (0x27F0, 0x27FF, "Supplemental Arrows-A", ()), (0x2800, 0x28FF, "Braille Patterns", ()), (0x2900, 0x297F, "Supplemental Arrows-B", ()), (0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B", ()), (0x2A00, 0x2AFF, "Supplemental Mathematical Operators", ()), (0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows", ()), ( 0x2C00, 0x2C5F, "Glagolitic", ( ALPHABET, HISTORIC, ), ), ( 0x2C60, 0x2C7F, "Latin Extended-C", ( ALPHABET, LATIN, ), ), ( 0x2C80, 0x2CFF, "Coptic", ( ALPHABET, HISTORIC, ), ), (0x2D00, 0x2D2F, "Georgian Supplement", (ALPHABET,)), (0x2D30, 0x2D7F, "Tifinagh", (ABJAD,)), (0x2D80, 0x2DDF, "Ethiopic Extended", (ABUGIDA,)), ( 0x2DE0, 0x2DFF, "Cyrillic Extended-A", ( ALPHABET, CYRILLIC, ), ), (0x2E00, 0x2E7F, "Supplemental Punctuation", ()), (0x2E80, 0x2EFF, "CJK Radicals Supplement", (CJK,)), (0x2F00, 0x2FDF, "Kangxi Radicals", (CJK,)), (0x2FF0, 0x2FFF, "Ideographic Description Characters", ()), (0x3000, 0x303F, "CJK Symbols and Punctuation", (CJK,)), (0x3040, 0x309F, "Hiragana", (CJK,)), (0x30A0, 0x30FF, "Katakana", (CJK,)), (0x3100, 0x312F, "Bopomofo", (CJK,)), ( 0x3130, 0x318F, "Hangul Compatibility Jamo", (HANGUL, CJK), ), (0x3190, 0x319F, "Kanbun", (CJK,)), (0x31A0, 0x31BF, "Bopomofo Extended", (CJK,)), (0x31C0, 0x31EF, "CJK Strokes", (CJK,)), (0x31F0, 0x31FF, "Katakana Phonetic Extensions", (CJK,)), (0x3200, 0x32FF, "Enclosed CJK Letters and Months", (CJK,)), (0x3300, 0x33FF, "CJK Compatibility", (CJK,)), (0x3400, 0x4DBF, "CJK Unified Ideographs Extension A", (CJK,)), (0x4DC0, 0x4DFF, "Yijing Hexagram Symbols", (CJK,)), (0x4E00, 0x9FFF, "CJK Unified Ideographs", (CJK,)), (0xA000, 0xA48F, "Yi Syllables", (SYLLABARY,)), (0xA490, 0xA4CF, "Yi Radicals", ()), (0xA4D0, 0xA4FF, "Lisu", (ABUGIDA,)), (0xA500, 0xA63F, "Vai", (SYLLABARY,)), ( 0xA640, 0xA69F, "Cyrillic Extended-B", ( ALPHABET, CYRILLIC, ), ), (0xA6A0, 0xA6FF, "Bamum", (SYLLABARY,)), (0xA700, 0xA71F, "Modifier Tone Letters", ()), ( 0xA720, 0xA7FF, "Latin Extended-D", ( ALPHABET, LATIN, ), ), (0xA800, 0xA82F, "Syloti Nagri", ()), (0xA830, 0xA83F, "Common Indic Number Forms", ()), (0xA840, 0xA87F, "Phags-pa", ()), (0xA880, 0xA8DF, "Saurashtra", ()), (0xA8E0, 0xA8FF, "Devanagari Extended", (ABUGIDA,)), (0xA900, 0xA92F, "Kayah Li", ()), (0xA930, 0xA95F, "Rejang", ()), (0xA960, 0xA97F, "Hangul Jamo Extended-A", (HANGUL,)), (0xA980, 0xA9DF, "Javanese", (ABUGIDA,)), (0xA9E0, 0xA9FF, "Myanmar Extended-B", (ABUGIDA,)), (0xAA00, 0xAA5F, "Cham", (ABUGIDA,)), (0xAA60, 0xAA7F, "Myanmar Extended-A", (ABUGIDA,)), (0xAA80, 0xAADF, "Tai Viet", ()), (0xAAE0, 0xAAFF, "Meetei Mayek Extensions", ()), (0xAB00, 0xAB2F, "Ethiopic Extended-A", (ABUGIDA,)), ( 0xAB30, 0xAB6F, "Latin Extended-E", ( ALPHABET, LATIN, ), ), (0xAB70, 0xABBF, "Cherokee Supplement", ()), (0xABC0, 0xABFF, "Meetei Mayek", ()), ( 0xAC00, 0xD7AF, "Hangul Syllables", (HANGUL, SYLLABARY), ), (0xD7B0, 0xD7FF, "Hangul Jamo Extended-B", (HANGUL,)), (0xD800, 0xDB7F, "High Surrogates", ()), (0xDB80, 0xDBFF, "High Private Use Surrogates", ()), (0xDC00, 0xDFFF, "Low Surrogates", ()), (0xE000, 0xF8FF, "Private Use Area", ()), (0xF900, 0xFAFF, "CJK Compatibility Ideographs", (CJK,)), (0xFB00, 0xFB4F, "Alphabetic Presentation Forms", ()), ( 0xFB50, 0xFDFF, "Arabic Presentation Forms-A", ( ARABIC, ABJAD, ), ), (0xFE00, 0xFE0F, "Variation Selectors", ()), (0xFE10, 0xFE1F, "Vertical Forms", ()), (0xFE20, 0xFE2F, "Combining Half Marks", (CJK,)), (0xFE30, 0xFE4F, "CJK Compatibility 
Forms", (CJK,)), (0xFE50, 0xFE6F, "Small Form Variants", ()), ( 0xFE70, 0xFEFF, "Arabic Presentation Forms-B", ( ARABIC, ABJAD, ), ), (0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms", (CJK,)), (0xFFF0, 0xFFFF, "Specials", ()), (0x10000, 0x1007F, "Linear B Syllabary", (SYLLABARY,)), (0x10080, 0x100FF, "Linear B Ideograms", ()), (0x10100, 0x1013F, "Aegean Numbers", ()), ( 0x10140, 0x1018F, "Ancient Greek Numbers", ( ALPHABET, GREEK, ), ), (0x10190, 0x101CF, "Ancient Symbols", ()), (0x101D0, 0x101FF, "Phaistos Disc", ()), (0x10280, 0x1029F, "Lycian", ()), (0x102A0, 0x102DF, "Carian", ()), (0x102E0, 0x102FF, "Coptic Epact Numbers", ()), (0x10300, 0x1032F, "Old Italic", ()), (0x10330, 0x1034F, "Gothic", ()), (0x10350, 0x1037F, "Old Permic", ()), (0x10380, 0x1039F, "Ugaritic", (ABJAD,)), (0x103A0, 0x103DF, "Old Persian", (ABJAD,)), (0x10400, 0x1044F, "Deseret", ()), (0x10450, 0x1047F, "Shavian", ()), (0x10480, 0x104AF, "Osmanya", ()), (0x104B0, 0x104FF, "Osage", ()), (0x10500, 0x1052F, "Elbasan", ()), (0x10530, 0x1056F, "Caucasian Albanian", ()), (0x10570, 0x105BF, "Vithkuqi", ()), (0x10600, 0x1077F, "Linear A", ()), ( 0x10780, 0x107BF, "Latin Extended-F", ( ALPHABET, LATIN, ), ), (0x10800, 0x1083F, "Cypriot Syllabary", (SYLLABARY,)), (0x10840, 0x1085F, "Imperial Aramaic", (ABJAD,)), (0x10860, 0x1087F, "Palmyrene", ()), (0x10880, 0x108AF, "Nabataean", (ABJAD,)), (0x108E0, 0x108FF, "Hatran", ()), (0x10900, 0x1091F, "Phoenician", ()), (0x10920, 0x1093F, "Lydian", ()), (0x10980, 0x1099F, "Meroitic Hieroglyphs", ()), (0x109A0, 0x109FF, "Meroitic Cursive", ()), (0x10A00, 0x10A5F, "Kharoshthi", ()), (0x10A60, 0x10A7F, "Old South Arabian", (ABJAD,)), (0x10A80, 0x10A9F, "Old North Arabian", (ABJAD,)), (0x10AC0, 0x10AFF, "Manichaean", ()), (0x10B00, 0x10B3F, "Avestan", ()), (0x10B40, 0x10B5F, "Inscriptional Parthian", ()), (0x10B60, 0x10B7F, "Inscriptional Pahlavi", ()), (0x10B80, 0x10BAF, "Psalter Pahlavi", (ABJAD,)), (0x10C00, 0x10C4F, "Old Turkic", ()), (0x10C80, 0x10CFF, "Old Hungarian", ()), (0x10D00, 0x10D3F, "Hanifi Rohingya", ()), (0x10E60, 0x10E7F, "Rumi Numeral Symbols", ()), ( 0x10E80, 0x10EBF, "Yezidi", ( ALPHABET, FUNKY, ), ), ( 0x10EC0, 0x10EFF, "Arabic Extended-C", ( ARABIC, ABJAD, ), ), (0x10F00, 0x10F2F, "Old Sogdian", (ABJAD,)), (0x10F30, 0x10F6F, "Sogdian", (ABJAD,)), (0x10F70, 0x10FAF, "Old Uyghur", ()), (0x10FB0, 0x10FDF, "Chorasmian", ()), (0x10FE0, 0x10FFF, "Elymaic", ()), (0x11000, 0x1107F, "Brahmi", (ABUGIDA,)), (0x11080, 0x110CF, "Kaithi", ()), (0x110D0, 0x110FF, "Sora Sompeng", ()), (0x11100, 0x1114F, "Chakma", ()), (0x11150, 0x1117F, "Mahajani", ()), (0x11180, 0x111DF, "Sharada", ()), (0x111E0, 0x111FF, "Sinhala Archaic Numbers", ()), (0x11200, 0x1124F, "Khojki", ()), (0x11280, 0x112AF, "Multani", ()), (0x112B0, 0x112FF, "Khudawadi", ()), (0x11300, 0x1137F, "Grantha", ()), (0x11400, 0x1147F, "Newa", ()), (0x11480, 0x114DF, "Tirhuta", ()), (0x11580, 0x115FF, "Siddham", ()), (0x11600, 0x1165F, "Modi", ()), (0x11660, 0x1167F, "Mongolian Supplement", ()), (0x11680, 0x116CF, "Takri", ()), (0x11700, 0x1174F, "Ahom", ()), (0x11800, 0x1184F, "Dogra", ()), (0x118A0, 0x118FF, "Warang Citi", ()), (0x11900, 0x1195F, "Dives Akuru", ()), (0x119A0, 0x119FF, "Nandinagari", ()), (0x11A00, 0x11A4F, "Zanabazar Square", ()), (0x11A50, 0x11AAF, "Soyombo", ()), ( 0x11AB0, 0x11ABF, "Unified Canadian Aboriginal Syllabics Extended-A", (SYLLABARY,), ), (0x11AC0, 0x11AFF, "Pau Cin Hau", ()), (0x11B00, 0x11B5F, "Devanagari Extended-A", ()), (0x11C00, 0x11C6F, "Bhaiksuki", ()), (0x11C70, 0x11CBF, 
"Marchen", ()), (0x11D00, 0x11D5F, "Masaram Gondi", ()), (0x11D60, 0x11DAF, "Gunjala Gondi", ()), (0x11EE0, 0x11EFF, "Makasar", ()), (0x11F00, 0x11F5F, "Kawi", ()), (0x11FB0, 0x11FBF, "Lisu Supplement", ()), (0x11FC0, 0x11FFF, "Tamil Supplement", ()), (0x12000, 0x123FF, "Cuneiform", ()), (0x12400, 0x1247F, "Cuneiform Numbers and Punctuation", ()), (0x12480, 0x1254F, "Early Dynastic Cuneiform", ()), (0x12F90, 0x12FFF, "Cypro-Minoan", ()), (0x13000, 0x1342F, "Egyptian Hieroglyphs", ()), (0x13430, 0x1345F, "Egyptian Hieroglyph Format Controls", ()), (0x14400, 0x1467F, "Anatolian Hieroglyphs", ()), (0x16800, 0x16A3F, "Bamum Supplement", ()), (0x16A40, 0x16A6F, "Mro", ()), (0x16A70, 0x16ACF, "Tangsa", ()), (0x16AD0, 0x16AFF, "Bassa Vah", ()), (0x16B00, 0x16B8F, "Pahawh Hmong", ()), (0x16E40, 0x16E9F, "Medefaidrin", ()), (0x16F00, 0x16F9F, "Miao", ()), (0x16FE0, 0x16FFF, "Ideographic Symbols and Punctuation", ()), (0x17000, 0x187FF, "Tangut", ()), (0x18800, 0x18AFF, "Tangut Components", ()), (0x18B00, 0x18CFF, "Khitan Small Script", ()), (0x18D00, 0x18D7F, "Tangut Supplement", ()), (0x1AFF0, 0x1AFFF, "Kana Extended-B", ()), (0x1B000, 0x1B0FF, "Kana Supplement", ()), (0x1B100, 0x1B12F, "Kana Extended-A", ()), (0x1B130, 0x1B16F, "Small Kana Extension", ()), (0x1B170, 0x1B2FF, "Nushu", ()), (0x1BC00, 0x1BC9F, "Duployan", ()), (0x1BCA0, 0x1BCAF, "Shorthand Format Controls", ()), (0x1CF00, 0x1CFCF, "Znamenny Musical Notation", ()), (0x1D000, 0x1D0FF, "Byzantine Musical Symbols", ()), (0x1D100, 0x1D1FF, "Musical Symbols", ()), (0x1D200, 0x1D24F, "Ancient Greek Musical Notation", ()), (0x1D2C0, 0x1D2DF, "Kaktovik Numerals", ()), (0x1D2E0, 0x1D2FF, "Mayan Numerals", ()), (0x1D300, 0x1D35F, "Tai Xuan Jing Symbols", ()), (0x1D360, 0x1D37F, "Counting Rod Numerals", ()), (0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols", ()), (0x1D800, 0x1DAAF, "Sutton SignWriting", ()), ( 0x1DF00, 0x1DFFF, "Latin Extended-G", ( ALPHABET, LATIN, ), ), (0x1E000, 0x1E02F, "Glagolitic Supplement", ()), ( 0x1E030, 0x1E08F, "Cyrillic Extended-D", ( ALPHABET, CYRILLIC, ), ), (0x1E100, 0x1E14F, "Nyiakeng Puachue Hmong", ()), (0x1E290, 0x1E2BF, "Toto", ()), (0x1E2C0, 0x1E2FF, "Wancho", ()), (0x1E4D0, 0x1E4FF, "Nag Mundari", ()), (0x1E7E0, 0x1E7FF, "Ethiopic Extended-B", (ABUGIDA,)), (0x1E800, 0x1E8DF, "Mende Kikakui", ()), (0x1E900, 0x1E95F, "Adlam", ()), (0x1EC70, 0x1ECBF, "Indic Siyaq Numbers", ()), (0x1ED00, 0x1ED4F, "Ottoman Siyaq Numbers", ()), ( 0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols", ( ARABIC, ABJAD, ), ), (0x1F000, 0x1F02F, "Mahjong Tiles", ()), (0x1F030, 0x1F09F, "Domino Tiles", ()), (0x1F0A0, 0x1F0FF, "Playing Cards", ()), (0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement", ()), (0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement", ()), (0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs", ()), (0x1F600, 0x1F64F, "Emoticons", ()), (0x1F650, 0x1F67F, "Ornamental Dingbats", ()), (0x1F680, 0x1F6FF, "Transport and Map Symbols", ()), (0x1F700, 0x1F77F, "Alchemical Symbols", ()), (0x1F780, 0x1F7FF, "Geometric Shapes Extended", ()), (0x1F800, 0x1F8FF, "Supplemental Arrows-C", ()), (0x1F900, 0x1F9FF, "Supplemental Symbols and Pictographs", ()), (0x1FA00, 0x1FA6F, "Chess Symbols", ()), (0x1FA70, 0x1FAFF, "Symbols and Pictographs Extended-A", ()), (0x1FB00, 0x1FBFF, "Symbols for Legacy Computing", ()), (0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B", (CJK,)), (0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C", (CJK,)), (0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D", (CJK,)), 
(0x2B820, 0x2CEAF, "CJK Unified Ideographs Extension E", (CJK,)), (0x2CEB0, 0x2EBEF, "CJK Unified Ideographs Extension F", (CJK,)), (0x2EBF0, 0x2EE5F, "CJK Unified Ideographs Extension I", (CJK,)), (0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement", (CJK,)), (0x30000, 0x3134F, "CJK Unified Ideographs Extension G", (CJK,)), (0x31350, 0x323AF, "CJK Unified Ideographs Extension H", (CJK,)), (0xE0000, 0xE007F, "Tags", ()), (0xE0100, 0xE01EF, "Variation Selectors Supplement", ()), (0xF0000, 0xFFFFF, "Supplementary Private Use Area-A", ()), (0x100000, 0x10FFFF, "Supplementary Private Use Area-B", ()), ) BLOCK_TAGS = [(s, e, t) for s, e, _, t in UNICODE_BLOCKS if len(t)] @lru_cache(maxsize=5000) def char_tags(char: str) -> Tuple[int, ...]: """Get the tags applicable to a particular character.""" codepoint = ord(char) for start, end, tags in BLOCK_TAGS: if start <= codepoint <= end: return tags return () def is_modern_alphabet(word: str) -> bool: """Check if a word is written in a modern alphabet. The term alphabet is used in a narrow sense here: it includes only alphabets that have vowels and are safely transliterated to latin. Basically: Cyrillic, Greek, Armenian, and Latin.""" for char in word: tags = char_tags(char) if not len(tags): continue if ALPHABET not in tags: return False if HISTORIC in tags or FUNKY in tags: return False return True def is_latin(word: str) -> bool: """Check if a word is written in the latin alphabet.""" for char in word: tags = char_tags(char) if not len(tags): continue if LATIN not in tags: return False if HISTORIC in tags or FUNKY in tags: return False return True normality-3.0.1/normality/slugify.py000066400000000000000000000026011504141236600176040ustar00rootroot00000000000000import string from typing import Any, Optional from normality.cleaning import squash_spaces, category_replace from normality.constants import SLUG_CATEGORIES, WS from normality.transliteration import ascii_text from normality.stringify import stringify VALID_CHARS = string.ascii_lowercase + string.digits + WS def slugify(value: Any, sep: str = "-") -> Optional[str]: """A simple slug generator. Slugs are pure ASCII lowercase strings that can be used in URLs an other places where a name has to be machine-safe. Consider using :func:`normality.slugify_text` instead, which avoids unnecessary stringification and is more efficient.""" text = stringify(value) if text is None: return None return slugify_text(text, sep=sep) def slugify_text(text: str, sep: str = "-") -> Optional[str]: """Slugify a text string. This will transliterate the text to ASCII, replace whitespace with the given separator, and remove all characters that are not alphanumeric or the separator.""" text = text.lower().replace(sep, WS) # run this first because it'll give better results on special # characters. replaced = category_replace(text, SLUG_CATEGORIES) text = ascii_text(replaced) text = squash_spaces(text) text = "".join([c for c in text if c in VALID_CHARS]) if len(text) == 0: return None return text.replace(WS, sep) normality-3.0.1/normality/stringify.py000066400000000000000000000031121504141236600201360ustar00rootroot00000000000000from datetime import datetime, date from decimal import Decimal from typing import Any, Optional from normality.cleaning import remove_unsafe_chars from normality.encoding import predict_encoding from normality.encoding import DEFAULT_ENCODING def _clean_empty(value: str) -> Optional[str]: # XXX: is this really a good idea? 
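    # Collapsing blank or whitespace-only strings to None lets stringify()
    # treat them the same as a missing value.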
value = value.strip() if not len(value): return None return value def stringify( value: Any, encoding_default: str = DEFAULT_ENCODING, encoding: Optional[str] = None ) -> Optional[str]: """Brute-force convert a given object to a string. This will attempt an increasingly mean set of conversions to make a given object into a unicode string. It is guaranteed to either return unicode or None, if all conversions failed (or the value is indeed empty). """ if value is None: return None if isinstance(value, str): return _clean_empty(value) if isinstance(value, (date, datetime)): return value.isoformat() elif isinstance(value, float): # Avoid trailing zeros and limit to 3 decimal places: return format(value, ".3f").rstrip("0").rstrip(".") elif isinstance(value, Decimal): return Decimal(value).to_eng_string() elif isinstance(value, bytes): if encoding is None: encoding = predict_encoding(value, default=encoding_default) value = value.decode(encoding, "replace") value = remove_unsafe_chars(value) if value is None: return None return _clean_empty(value) return _clean_empty(str(value)) normality-3.0.1/normality/transliteration.py000066400000000000000000000053771504141236600213610ustar00rootroot00000000000000""" Transliterate the given text to the latin script. This attempts to convert a given text to latin script using the closest match of characters vis a vis the original script. Transliteration requires an extensive unicode mapping. Since all Python implementations are either GPL-licensed (and thus more restrictive than this library) or come with a massive C code dependency, this module requires neither but will use a package if it is installed. """ from typing import Callable from functools import lru_cache import warnings from icu import Transliterator # type: ignore Trans = Callable[[str], str] # Transform to latin, separate accents, decompose, remove # symbols, compose, push to ASCII ASCII_SCRIPT = "Any-Latin; NFKD; [:Nonspacing Mark:] Remove; Accents-Any; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; Latin-ASCII" # noqa # nb. 2021-11-05 Accents-Any is now followed with another nonspacing mark remover. # This script is becoming a bit silly, there has to be a nicer way to do this? _ASCII: Trans = Transliterator.createInstance(ASCII_SCRIPT).transliterate _LATINIZE: Trans = Transliterator.createInstance("Any-Latin").transliterate MAX_ASCII = 127 # No non-ASCII characters below this point. MAX_LATIN = 740 # No non-latin characters below this point. class ICUWarning(UnicodeWarning): pass @lru_cache(maxsize=2**16) def latinize_text(text: str, ascii: bool = False) -> str: """Transliterate the given text to the latin script. This attempts to convert a given text to latin script using the closest match of characters vis a vis the original script. """ if text is None: warnings.warn( "normality.latinize_text will stop handling None soon.", DeprecationWarning, stacklevel=2, ) return "" if ascii: return ascii_text(text) is_latin = True for char in text: if ord(char) > MAX_LATIN: is_latin = False break if is_latin: # If the text is already latin, we can just return it. return text return _LATINIZE(text) def ascii_text(text: str) -> str: """Transliterate the given text and make sure it ends up as ASCII.""" if text is None: warnings.warn( "normality.ascii_text will stop handling None soon.", DeprecationWarning, stacklevel=2, ) return "" is_ascii = True for char in text: if ord(char) > MAX_ASCII: is_ascii = False break if is_ascii: # If the text is already ASCII, we can just return it. 
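    # Returning early here also keeps trivially-ASCII strings out of the
    # lru_cache-backed _ascii_text helper defined below.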
return text return _ascii_text(text) @lru_cache(maxsize=2**16) def _ascii_text(text: str) -> str: result = _ASCII(text) return result.encode("ascii", "replace").decode("ascii") normality-3.0.1/normality/util.py000066400000000000000000000003431504141236600171000ustar00rootroot00000000000000# Given the whole thing is a utility package, this is really meta. from typing import Any, Dict, Optional Categories = Dict[str, Optional[str]] Encoding = str def is_text(data: Any) -> bool: return isinstance(data, str) normality-3.0.1/pyproject.toml000066400000000000000000000024301504141236600164460ustar00rootroot00000000000000[build-system] requires = ["hatchling"] build-backend = "hatchling.build" [project] name = "normality" version = "3.0.1" authors = [{ name = "Friedrich Lindenberg", email = "friedrich@pudo.org" }] license = { file = "LICENSE" } description = "Micro-library to normalize text strings" readme = "README.md" keywords = ["text", "unicode", "normalization", "slugs"] classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] requires-python = ">= 3.9" dependencies = [ "banal >= 1.0.1", "pyicu >= 2.10.0", "chardet", "charset-normalizer >= 2.0.0", ] # dynamic = ["version"] [project.urls] Documentation = "https://github.com/pudo/normality" Repository = "https://github.com/pudo/normality.git" Issues = "https://github.com/pudo/normality/issues" [project.optional-dependencies] icu = [] dev = ["mypy", "pytest", "types-chardet", "wheel", "build"] [tool.hatch.build.targets.sdist] only-include = ["normality", "LICENSE", "README.md"] [tool.distutils.bdist_wheel] universal = true normality-3.0.1/tests/000077500000000000000000000000001504141236600146755ustar00rootroot00000000000000normality-3.0.1/tests/__init__.py000066400000000000000000000000001504141236600167740ustar00rootroot00000000000000normality-3.0.1/tests/fixtures/000077500000000000000000000000001504141236600165465ustar00rootroot00000000000000normality-3.0.1/tests/fixtures/utf-16.txt000066400000000000000000000000701504141236600203260ustar00rootroot00000000000000>@>H5=:> 5B@> ;5:AV9>28Gnormality-3.0.1/tests/test_cleaning.py000066400000000000000000000027131504141236600200710ustar00rootroot00000000000000from normality.cleaning import remove_unsafe_chars, collapse_spaces, squash_spaces def test_remove_unsafe_chars(): assert remove_unsafe_chars(None) == "" # type: ignore assert remove_unsafe_chars("") == "" assert remove_unsafe_chars(" ") == " " assert remove_unsafe_chars("\u2028 ") == " " assert remove_unsafe_chars("\ufeff ") == " " assert remove_unsafe_chars("lalala\ufeff ") == "lalala\ufeff " assert remove_unsafe_chars("lalala\u200bx") == "lalalax" def test_collapse_spaces(): assert collapse_spaces(None) is None # type: ignore assert collapse_spaces("") is None assert collapse_spaces(" ") is None assert collapse_spaces(" ") is None assert collapse_spaces(" \n ") is None assert collapse_spaces(" \n\n ") is None assert collapse_spaces(" \njfshdhdfjk\n ") == "jfshdhdfjk" assert collapse_spaces(" \njfshd\t\thdfjk\n ") == "jfshd hdfjk" assert collapse_spaces(" \n\u2028\u2029\u200b\u200c\n ") is None assert collapse_spaces("a\u200bx") == "a x" def test_squash_spaces(): assert squash_spaces("") == "" assert squash_spaces(" ") == "" assert squash_spaces(" ") == "" assert 
squash_spaces(" \n ") == "" assert squash_spaces(" \n\n ") == "" assert squash_spaces(" \njfshdhdfjk\n ") == "jfshdhdfjk" assert squash_spaces(" \njfshd\t\thdfjk\n ") == "jfshd hdfjk" assert squash_spaces(" \n\u2028\u2029\u200b\u200c\n ") == "" assert squash_spaces("a\u200bx") == "a x" normality-3.0.1/tests/test_normality.py000066400000000000000000000061261504141236600203310ustar00rootroot00000000000000from datetime import datetime, UTC from normality import normalize, latinize_text, ascii_text from normality import ( stringify, slugify, guess_encoding, guess_file_encoding, predict_file_encoding, predict_encoding, ) def test_empty(): assert slugify(None) is None assert ascii_text(None) == "" # type: ignore assert ascii_text("") == "" assert latinize_text(None) == "" # latinize_text returns empty string for None assert normalize(None) is None assert normalize("") is None assert normalize(" ") is None def test_petro(): text = "Порошенко Петро Олексійович" assert slugify(text) == "porosenko-petro-oleksijovic" assert ascii_text(text) == "Porosenko Petro Oleksijovic" assert latinize_text(text) == "Porošenko Petro Oleksíjovič" assert normalize(text) == "порошенко петро олексіиович" def test_ahmad(): text = "əhməd" assert ascii_text(text) == "ahmad" def test_azeri(): text = "FUAD ALIYEV ƏHMƏD OĞLU" assert ascii_text(text) == "FUAD ALIYEV AHMAD OGLU" def test_slugify(): text = "BABY! camel-is good" assert slugify(text, sep="-") == "baby-camel-is-good" assert slugify("testʼs", sep="-") == "tests" assert slugify("test_s", sep="-") == "test-s" assert slugify("-", sep="-") is None assert slugify("", sep="-") is None assert slugify("- -", sep="-") is None assert slugify(None, sep="-") is None def test_georgian(): text = "ავლაბრის ფონდი" assert ascii_text(text) == "avlabris pondi" def test_german(): text = "Häschen Spaß" assert ascii_text(text) == "Haschen Spass" assert slugify(text, sep="-") == "haschen-spass" def test_stringify(): assert stringify(" . ") == "." 
assert stringify(5) == "5" assert stringify(0.5) == "0.5" def test_stringify_datetime(): dt = datetime.now(UTC) text = stringify(dt) assert text is not None assert text.startswith("%s-" % dt.year), text def test_guess_encoding(): text = "Порошенко Петро Олексійович" encoded = text.encode("iso-8859-5") out = guess_encoding(encoded) assert out == "iso8859-5" def test_predict_encoding(): text = "Порошенко Петро Олексійович" encoded = text.encode("iso-8859-5") out = predict_encoding(encoded) assert out == "iso8859-5" def test_guess_file_encoding(): with open("tests/fixtures/utf-16.txt", "rb") as fh: out = guess_file_encoding(fh) assert out == "utf-16" def test_predict_file_encoding(): with open("tests/fixtures/utf-16.txt", "rb") as fh: out = predict_file_encoding(fh) assert out == "utf-16" def test_petro_iso_encoded(): text = "Порошенко Петро Олексійович" encoded = text.encode("iso8859-5") out = stringify(encoded) assert out == text def test_petro_utf16_encoded(): text = "Порошенко Петро Олексійович" encoded = text.encode("utf-16") out = stringify(encoded) assert out == text normality-3.0.1/tests/test_paths.py000066400000000000000000000021451504141236600174270ustar00rootroot00000000000000from normality.paths import MAX_LENGTH, safe_filename def test_safe_filename(): assert safe_filename(None) is None assert safe_filename("test.txt") == "test.txt" assert safe_filename("test .txt") == "test.txt" assert safe_filename("test bla.txt") == "test_bla.txt" assert safe_filename("test_bla.txt") == "test_bla.txt" assert safe_filename("test.bla.txt") == "test_bla.txt" assert safe_filename("test", extension="txt") == "test.txt" def test_long_filename(): long_name = ["long name"] * 100 long_name = "-".join(long_name) shortened = safe_filename(long_name) assert shortened is not None assert len(shortened) <= MAX_LENGTH, shortened shortened = safe_filename(long_name, extension="html") assert shortened is not None assert len(shortened) <= MAX_LENGTH, shortened shortened = safe_filename("bla", extension=long_name) assert shortened is not None assert len(shortened) <= MAX_LENGTH, shortened shortened = safe_filename(long_name, extension=long_name) assert shortened is not None assert len(shortened) <= MAX_LENGTH, shortened normality-3.0.1/tests/test_scripts.py000066400000000000000000000012371504141236600200000ustar00rootroot00000000000000from normality.scripts import ALPHABET, CYRILLIC, CJK from normality.scripts import char_tags, is_modern_alphabet def test_char_tags(): assert ALPHABET in char_tags("a") assert CYRILLIC not in char_tags("a") assert CYRILLIC in char_tags("д") assert CJK in char_tags("近") assert ALPHABET not in char_tags("近") def test_is_modern_alphabet(): assert not is_modern_alphabet(" 习近平") assert is_modern_alphabet("Xí Jìnpíng") assert is_modern_alphabet("Ротенберг Аркадий") assert is_modern_alphabet(".,[]{}()!@#$%^&*()_+)«»‘“") assert not is_modern_alphabet("တပ်မတော်(ကြည်")
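# A possible addition, not part of the original suite: the is_latin helper in
# normality/scripts.py is not exercised above. These assertions sketch its
# expected behaviour, based on the block tags declared in UNICODE_BLOCKS.
def test_is_latin():
    from normality.scripts import is_latin

    assert is_latin("Hello")
    assert is_latin("Xí Jìnpíng")
    assert not is_latin("Ротенберг Аркадий")
    assert not is_latin("近平")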