tiktoken-0.6.0/.github/workflows/build_wheels.yml
name: Build wheels

on: [push, pull_request, workflow_dispatch]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build_wheels:
    name: py${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        # cibuildwheel builds linux wheels inside a manylinux container
        # it also takes care of procuring the correct python version for us
        os: [ubuntu-latest, windows-latest, macos-latest]
        python-version: [38, 39, 310, 311, 312]

    steps:
      - uses: actions/checkout@v4

      - uses: pypa/cibuildwheel@v2.16.5
        env:
          CIBW_BUILD: "cp${{ matrix.python-version}}-*"

      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./wheelhouse/*.whl

  build_wheels_aarch64:
    name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64)
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: [38, 39, 310, 311, 312]

    steps:
      - uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
        with:
          platforms: arm64

      - name: Build wheels
        uses: pypa/cibuildwheel@v2.16.5
        env:
          CIBW_BUILD: "cp${{ matrix.python-version}}-*"
          CIBW_ARCHS: aarch64
          CIBW_BUILD_VERBOSITY: 3
          # https://github.com/rust-lang/cargo/issues/10583
          CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true

      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./wheelhouse/*.whl

  build_sdist:
    name: sdist
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        name: Install Python
        with:
          python-version: "3.9"
      - name: Run check-manifest
        run: |
          pip install check-manifest
          check-manifest -v
      - name: Build sdist
        run: |
          pip install --upgrade build
          python -m build --sdist
      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./dist/*.tar.gz

tiktoken-0.6.0/.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Environments
.env
.venv

# Tools
.mypy_cache
.coverage
.hypothesis
htmlcov

# General
.DS_Store
Cargo.lock
target/

tiktoken-0.6.0/CHANGELOG.md
# Changelog

This is the changelog for the open source version of tiktoken.

## [v0.6.0]
- Optimise regular expressions for a 20% performance improvement
- Add `text-embedding-3-*` models to `encoding_for_model`
- Check content hash for downloaded files
- Allow pickling `Encoding` objects. Registered `Encoding` will be pickled by reference
- Work around PyO3 bug for frozenset conversion

Thank you to @paplorinc, @mdwelsh, @Praneet460!
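To illustrate the pickling change above — a minimal sketch, assuming tiktoken >= 0.6.0 and only the public API; the encoding name is arbitrary:

```python
import pickle

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# A registered Encoding round-trips through pickle (pickled by reference, per the note above)
restored = pickle.loads(pickle.dumps(enc))
assert restored.name == enc.name
assert restored.encode("hello world") == enc.encode("hello world")
```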
## [v0.5.2]
- Build wheels for Python 3.12
- Update version of PyO3 to allow multiple imports
- Avoid permission errors when using default cache logic

## [v0.5.1]
- Add `encoding_name_for_model`, undo some renames to variables that are implementation details

## [v0.5.0]
- Add `tiktoken._educational` submodule to better document how byte pair encoding works
- Ensure `encoding_for_model` knows about several new models
- Add `decode_with_offsets`
- Better error for failures with the plugin mechanism
- Make more tests public
- Update versions of dependencies

## [v0.4.0]
- Add `decode_batch` and `decode_bytes_batch`
- Improve error messages and handling

## [v0.3.3]
- `tiktoken` will now make a best-effort attempt to replace surrogate pairs with the corresponding Unicode character and will replace lone surrogates with the Unicode replacement character.

## [v0.3.2]
- Add encoding for GPT-4

## [v0.3.1]
- Build aarch64 wheels
- Make `blobfile` an optional dependency

Thank you to @messense for the environment variable that makes cargo not OOM under emulation!

## [v0.3.0]
- Improve performance by 5-20%; thank you to @nistath!
- Add `gpt-3.5-turbo` models to `encoding_for_model`
- Add prefix matching to `encoding_for_model` to better support future model versions
- Fix a bug in the README instructions on extending tiktoken
- Update the set of available encodings
- Add packaging metadata

## [v0.2.0]
- Add `tiktoken.encoding_for_model` to get the encoding for a specific model
- Improve portability of caching logic

Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections

## [v0.1.2]
- Avoid use of `blobfile` for public files
- Add support for Python 3.8
- Add py.typed
- Improve the public tests

## [v0.1.1]
- Initial release

tiktoken-0.6.0/Cargo.toml
[package]
name = "tiktoken"
version = "0.6.0"
edition = "2021"
rust-version = "1.57.0"

[lib]
name = "_tiktoken"
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20.0", features = ["extension-module"] }

# tiktoken dependencies
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"

tiktoken-0.6.0/LICENSE
MIT License

Copyright (c) 2022 OpenAI, Shantanu Jain

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
tiktoken-0.6.0/MANIFEST.in
include *.svg
include *.toml
include *.md
include Makefile
global-include py.typed
recursive-include scripts *.py
recursive-include tests *.py
recursive-include src *.rs

tiktoken-0.6.0/README.md
# ⏳ tiktoken

tiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with OpenAI's models.

```python
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4")
```

The open source version of `tiktoken` can be installed from PyPI:
```
pip install tiktoken
```

The tokeniser API is documented in `tiktoken/core.py`.

Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).

## Performance

`tiktoken` is 3-6x faster than a comparable open source tokeniser:

![image](https://raw.githubusercontent.com/openai/tiktoken/main/perf.svg)

Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from `tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.

## Getting help

Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).

If you work at OpenAI, make sure to check the internal documentation or feel free to contact @shantanu.

## What is BPE anyway?

Language models don't see text the way you and I do; instead, they see a sequence of numbers (known as tokens). Byte pair encoding (BPE) is a way of converting text into tokens. It has a couple of desirable properties:
1) It's reversible and lossless, so you can convert tokens back into the original text
2) It works on arbitrary text, even text that is not in the tokeniser's training data
3) It compresses the text: the token sequence is shorter than the bytes corresponding to the original text. On average, in practice, each token corresponds to about 4 bytes.
4) It attempts to let the model see common subwords. For instance, "ing" is a common subword in English, so BPE encodings will often split "encoding" into tokens like "encod" and "ing" (instead of e.g. "enc" and "oding"). Because the model will then see the "ing" token again and again in different contexts, it helps models generalise and better understand grammar.

`tiktoken` contains an educational submodule that is friendlier if you want to learn more about the details of BPE, including code that helps visualise the BPE procedure:
```python
from tiktoken._educational import *

# Train a BPE tokeniser on a small amount of text
enc = train_simple_encoding()

# Visualise how the GPT-4 encoder encodes text
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
enc.encode("hello world aaaaaaaaaaaa")
```

## Extending tiktoken

You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
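Whichever route you choose, it's worth sanity-checking the resulting `Encoding`. The following is a minimal, hedged sketch: the `cl100k_im` name and the `<|im_start|>`/`<|im_end|>` tokens are purely illustrative and mirror the example in the first option below.

```python
import tiktoken

cl100k_base = tiktoken.get_encoding("cl100k_base")

# Illustrative extended encoding; mirrors the example in option 1 below
enc = tiktoken.Encoding(
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|im_start|>": 100264,
        "<|im_end|>": 100265,
    },
)

# Ordinary text still round-trips...
assert enc.decode(enc.encode("hello world")) == "hello world"
# ...and the new special tokens encode to the ids we assigned
assert enc.encode("<|im_start|>", allowed_special="all") == [100264]
```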
**Create your `Encoding` object exactly the way you want and simply pass it around.**

```python
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
enc = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|im_start|>": 100264,
        "<|im_end|>": 100265,
    }
)
```

**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**

This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer option 1.

To do this, you'll need to create a namespace package under `tiktoken_ext`.

Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
```
my_tiktoken_extension
├── tiktoken_ext
│   └── my_encodings.py
└── setup.py
```

`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`. This is a dictionary from an encoding name to a function that takes no arguments and returns arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see `tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.

Your `setup.py` should look something like this:
```python
from setuptools import setup, find_namespace_packages

setup(
    name="my_tiktoken_extension",
    packages=find_namespace_packages(include=['tiktoken_ext*']),
    install_requires=["tiktoken"],
    ...
)
```

Then simply `pip install ./my_tiktoken_extension` and you should be able to use your custom encodings! Make sure **not** to use an editable install.

tiktoken-0.6.0/perf.svg
[Figure: throughput in MB/s (0-40) versus thread count (1-64) for tiktoken and huggingface]

tiktoken-0.6.0/pyproject.toml
[project]
name = "tiktoken"
version = "0.6.0"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
authors = [{name = "Shantanu Jain"}, {email = "shantanu@openai.com"}]
dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
optional-dependencies = {blobfile = ["blobfile>=2"]}
requires-python = ">=3.8"

[project.urls]
homepage = "https://github.com/openai/tiktoken"
repository = "https://github.com/openai/tiktoken"
changelog = "https://github.com/openai/tiktoken/blob/main/CHANGELOG.md"

[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]

[tool.cibuildwheel]
build-frontend = "build"
build-verbosity = 1

linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
macos.before-all = "rustup target add aarch64-apple-darwin"

skip = [
  "*-manylinux_i686",
  "*-musllinux_i686",
  "*-win32",
]
macos.archs = ["x86_64", "arm64"]
# When cross-compiling on Intel, it is not possible to test arm64 wheels.
# Warnings will be silenced with following CIBW_TEST_SKIP test-skip = "*-macosx_arm64" before-test = "pip install pytest hypothesis" test-command = "pytest {project}/tests --import-mode=append" [[tool.cibuildwheel.overrides]] select = "*linux_aarch64" test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'""" tiktoken-0.6.0/scripts/000077500000000000000000000000001456133667000150475ustar00rootroot00000000000000tiktoken-0.6.0/scripts/benchmark.py000066400000000000000000000017501456133667000173560ustar00rootroot00000000000000import base64 import functools import gzip import json import os import random import time from typing import Any, cast import blobfile import tiktoken def benchmark_batch(documents: list[str]) -> None: num_threads = int(os.environ["RAYON_NUM_THREADS"]) num_bytes = sum(map(len, map(str.encode, documents))) print(f"num_threads: {num_threads}, num_bytes: {num_bytes}") enc = tiktoken.get_encoding("gpt2") enc.encode("warmup") start = time.perf_counter_ns() enc.encode_ordinary_batch(documents, num_threads=num_threads) end = time.perf_counter_ns() print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s") import transformers hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2") hf_enc.model_max_length = 1e30 # silence! hf_enc.encode("warmup") start = time.perf_counter_ns() hf_enc(documents) end = time.perf_counter_ns() print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s") tiktoken-0.6.0/scripts/redact.py000066400000000000000000000033221456133667000166630ustar00rootroot00000000000000import argparse import re import subprocess from pathlib import Path def redact_file(path: Path, dry_run: bool) -> None: if not path.exists() or path.is_dir(): return text = path.read_text() if not text: return first_line = text.splitlines()[0] if "redact" in first_line: if not dry_run: path.unlink() print(f"Deleted {path}") return pattern = "|".join( r" *" + re.escape(x) for x in [ "# ===== redact-beg =====\n", "# ===== redact-end =====\n", "\n", "\n", ] ) if re.search(pattern, text): redacted_text = "".join(re.split(pattern, text)[::2]) if not dry_run: path.write_text(redacted_text) print(f"Redacted {path}") return print(f"Skipped {path}") def redact(dry_run: bool) -> None: tiktoken_root = Path(__file__).parent.parent assert tiktoken_root.name == "tiktoken" assert (tiktoken_root / "pyproject.toml").exists() try: output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True) paths = [Path(p) for p in output.splitlines()] except subprocess.CalledProcessError: paths = list(tiktoken_root.glob("**/*")) for path in paths: redact_file(path, dry_run=dry_run) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--dry-run", type=lambda x: not x or x[0].lower() != "f", default=True) args = parser.parse_args() redact(args.dry_run) if args.dry_run: print("Dry run, use --dry-run=false to actually redact files") if __name__ == "__main__": main() tiktoken-0.6.0/setup.py000066400000000000000000000010331456133667000150670ustar00rootroot00000000000000from setuptools import setup from setuptools_rust import Binding, RustExtension setup( name="tiktoken", rust_extensions=[ RustExtension( "tiktoken._tiktoken", binding=Binding.PyO3, # Between our use of editable installs and wanting to use Rust for performance sensitive # code, it makes sense to just always use --release debug=False, ) ], package_data={"tiktoken": ["py.typed"]}, packages=["tiktoken", 
"tiktoken_ext"], zip_safe=False, ) tiktoken-0.6.0/src/000077500000000000000000000000001456133667000141475ustar00rootroot00000000000000tiktoken-0.6.0/src/lib.rs000066400000000000000000000575551456133667000153040ustar00rootroot00000000000000// This check is new and seems buggy (possibly with PyO3 interaction) #![allow(clippy::borrow_deref_ref)] use std::collections::HashSet; use std::thread; use fancy_regex::Regex; use pyo3::exceptions; use pyo3::prelude::*; use pyo3::types::{PyBytes, PyList, PyTuple}; use pyo3::PyResult; use rustc_hash::FxHashMap as HashMap; fn _byte_pair_merge( piece: &[u8], ranks: &HashMap, usize>, f: impl Fn(std::ops::Range) -> T, ) -> Vec { // This is a vector of (start, rank). // The rank is of the byte pair starting at position start. // The rank of the last item in the vector is not a valid value. let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect(); let get_rank = { #[inline(always)] |parts: &Vec<(usize, usize)>, start_idx: usize, skip: usize| { if (start_idx + skip + 2) < parts.len() { ranks .get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0]) .copied() } else { None } } }; // We look up the ranks once in the beginning and iteratively update // them during each merge, which reduces the number of rank lookups. for i in 0..parts.len() - 2 { match get_rank(&parts, i, 0) { Some(rank) => { // usize::MAX is a sentinel value and cannot be a valid rank debug_assert!(rank != usize::MAX); parts[i].1 = rank; } None => { continue; } }; } // If you have n parts and m merges, this does O(mn) work. // We could do something with a heap and do O(m log n) work. // It is important to consider that n is often small (<100), and as such // the cache-locality benefits outweigh the algorithmic complexity downsides // of the `parts` vector data structure above. // Note that we hash bytes, not token pairs. As long as we train BPE the way we // currently do, this is equivalent. An easy way to break this would be to decouple // merge priority from token index or to prevent specific token merges. loop { if parts.len() == 1 { break; } // usize::MAX is a sentinel rank value allowing us to // take the min more quickly let mut min_rank: (usize, usize) = (usize::MAX, 0); for (i, &(_, rank)) in parts[..parts.len() - 1].iter().enumerate() { if rank < min_rank.0 { min_rank = (rank, i); } } if min_rank.0 != usize::MAX { let i = min_rank.1; // NOTE: We are about to remove parts[i + 1]. We do not do it // yet because there are cache-locality benefits to updating // parts[i] and parts[i-1] before removing, which could thrash // the cache. Thus, we update the rank calculation by skipping over // parts[i + 1], by invoking `get_rank!` with `skip = 1`. parts[i].1 = get_rank(&parts, i, 1).unwrap_or(usize::MAX); if i > 0 { parts[i - 1].1 = get_rank(&parts, i - 1, 1).unwrap_or(usize::MAX); } parts.remove(i + 1); } else { break; } } let mut out: Vec = Vec::with_capacity(parts.len() - 1); for i in 0..parts.len() - 1 { out.push(f(parts[i].0..parts[i + 1].0)); } out } pub fn byte_pair_encode(piece: &[u8], ranks: &HashMap, usize>) -> Vec { if piece.len() == 1 { return vec![ranks[piece]]; } _byte_pair_merge(piece, ranks, |p| ranks[&piece[p.start..p.end]]) } pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, usize>) -> Vec<&'a [u8]> { if piece.len() == 1 { return vec![piece]; } _byte_pair_merge(piece, ranks, |p| &piece[p.start..p.end]) } // Various performance notes: // // Regex // ===== // Most of the time is spent in regex. 
The easiest way to speed this up is by using less fancy // regex features. For instance, using a regex parse-able by `regex` crate is 3x faster than // the usual regex we use. // // However, given that we're using a regex parse-able by `regex`, there isn't much difference // between using the `regex` crate and using the `fancy_regex` crate. // // There is an important interaction between threading, `regex` and `fancy_regex`. // When using `fancy_regex`, we hit `regex.find_at`. It turns out that this causes contention on // some mutable scratch space inside of `regex`. This absolutely kills performance. When using plain // old `regex`, we don't hit this, because `find_iter` has a different code path. // Related: https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md // Anyway, the way we get around this is with having a (mostly) thread local clone of the regex for // each thread. // // Threading // ========= // I tried using `rayon`. It wasn't really faster than using Python threads and releasing the GIL. // So goodbye `rayon`! Let thread count etc be in control of our Python users. // // Caching // ======= // The reference tokeniser has an lru cache over the equivalent of `byte_pair_encode`. // Originally, we had one too! Without it, we were only vaguely faster than Python. // I used an RWLock to protect the cache. This didn't seem to hurt single threaded performance // noticeably, but it did affect multi-threaded performance. Weirdly, it seemed to affect // multi-threaded performance even when I only had readers (maybed I messed something up?). // Anyway, I realised that we could get rid of the cache, if we treat the set of tokens as a cache! // These are exactly the set or merges that are likely to be hot. And now we don't have to think // about interior mutability, memory use, or cloning. // // Hashing // ======= // We use FxHashMap instead of the standard HashMap. This is maybe like a 5-10% win? // The current implementation ends up doing a lot of hashing of bytes. In theory, this could be made // to be hashing of two-tuples of ints, which looks like it may also be a couple percent faster. use std::num::NonZeroU64; pub struct FakeThreadId(NonZeroU64); fn hash_current_thread() -> usize { // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter // that works great for our use case of avoiding collisions in our array. Unfortunately, // it's private. However, there are only so many ways you can layout a u64, so just transmute // https://github.com/rust-lang/rust/issues/67939 const _: [u8; 8] = [0; std::mem::size_of::()]; const _: [u8; 8] = [0; std::mem::size_of::()]; let x = unsafe { std::mem::transmute::(thread::current().id()).0 }; u64::from(x) as usize } const MAX_NUM_THREADS: usize = 128; #[pyclass] struct CoreBPE { encoder: HashMap, usize>, special_tokens_encoder: HashMap, decoder: HashMap>, special_tokens_decoder: HashMap>, regex_tls: Vec, special_regex_tls: Vec, sorted_token_bytes: Vec>, } impl CoreBPE { fn _get_tl_regex(&self) -> &Regex { // See performance notes above for what this is about // It's also a little janky, please make a better version of it! 
// However, it's nice that this doesn't leak memory to short-lived threads &self.regex_tls[hash_current_thread() % MAX_NUM_THREADS] } fn _get_tl_special_regex(&self) -> &Regex { &self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS] } fn _decode_native(&self, tokens: &[usize]) -> Vec { let mut ret = Vec::with_capacity(tokens.len() * 2); for token in tokens { let token_bytes = self .decoder .get(token) .unwrap_or_else(|| &self.special_tokens_decoder[token]); ret.extend(token_bytes); } ret } fn _encode_ordinary_native(&self, text: &str) -> Vec { // This is the core of the encoding logic; the other functions in here // just make things complicated :-) let regex = self._get_tl_regex(); let mut ret = vec![]; for mat in regex.find_iter(text) { let piece = mat.unwrap().as_str().as_bytes(); if let Some(token) = self.encoder.get(piece) { ret.push(*token); continue; } ret.extend(&byte_pair_encode(piece, &self.encoder)); } ret } fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec, usize) { let special_regex = self._get_tl_special_regex(); let regex = self._get_tl_regex(); let mut ret = vec![]; let mut start = 0; let mut last_piece_token_len = 0; loop { let mut next_special; let mut start_find = start; loop { // Find the next allowed special token, if any next_special = special_regex.find_from_pos(text, start_find).unwrap(); match next_special { Some(m) => { if allowed_special.contains(&text[m.start()..m.end()]) { break; } start_find = m.start() + 1; } None => break, } } let end = next_special.map_or(text.len(), |m| m.start()); // Okay, here we go, compare this logic to _encode_ordinary_native for mat in regex.find_iter(&text[start..end]) { let piece = mat.unwrap().as_str().as_bytes(); if let Some(token) = self.encoder.get(piece) { last_piece_token_len = 1; ret.push(*token); continue; } let tokens = byte_pair_encode(piece, &self.encoder); last_piece_token_len = tokens.len(); ret.extend(&tokens); } match next_special { // And here we push the special token Some(m) => { let piece = m.as_str(); let token = self.special_tokens_encoder[piece]; ret.push(token); start = m.end(); last_piece_token_len = 0; } None => break, } } // last_piece_token_len is how many tokens came from the last regex split. This is used // for determining unstable tokens, since you can't merge across (stable) regex splits (ret, last_piece_token_len) } fn _increase_last_piece_token_len( &self, tokens: Vec, mut last_piece_token_len: usize, ) -> (Vec, usize) { // Unfortunately, the locations where our regex splits can be unstable. // For the purposes of determining unstable tokens, unstable regex splitting // is only a problem if a split that was present disappears, since this can // lead to merging of tokens otherwise thought to be stable. // cl100k_base makes our life hard by including the \s*[\r\n]+ // pattern. This can e.g. cause "\n" + " " to become "\n \n". 
// Here is a quick and dirty fix: { let token_is_all_space = |token| { self.decoder .get(token) .map(|token_bytes| { token_bytes .iter() .rev() .all(|&b| [b' ', b'\n', b'\t'].contains(&b)) }) .unwrap_or(false) }; if last_piece_token_len > 0 && token_is_all_space(&tokens[tokens.len() - last_piece_token_len]) { while (last_piece_token_len < tokens.len()) && token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1]) { last_piece_token_len += 1; } } } debug_assert!(last_piece_token_len <= tokens.len()); (tokens, last_piece_token_len) } fn _encode_unstable_native( &self, text: &str, allowed_special: &HashSet<&str>, ) -> (Vec, HashSet>) { let (tokens, last_piece_token_len) = self._encode_native(text, allowed_special); if last_piece_token_len == 0 { // If last_piece_token_len is zero, the last token was a special token and we have // no unstable bytes return (tokens, HashSet::new()); } let (mut tokens, last_piece_token_len) = self._increase_last_piece_token_len(tokens, last_piece_token_len); let unstable_bytes = self._decode_native(&tokens[tokens.len() - last_piece_token_len..]); tokens.truncate(tokens.len() - last_piece_token_len); // TODO: we should try harder to find additional stable tokens // This would reduce the amount of retokenising when determining completions // Refer to the logic in an older version of this file let mut completions = HashSet::new(); if unstable_bytes.is_empty() { return (tokens, completions); } // This is the easy bit. Just find all single tokens that start with unstable_bytes // (including tokens that exactly match unstable_bytes) // Separating this from the loop below helps with performance in a common case. let mut point = self .sorted_token_bytes .partition_point(|x| x.as_slice() < unstable_bytes.as_slice()); while point < self.sorted_token_bytes.len() && self.sorted_token_bytes[point].starts_with(&unstable_bytes) { completions.insert(vec![ self.encoder[self.sorted_token_bytes[point].as_slice()], ]); point += 1; } // Now apply even more brute force. At every (other) possible position for the straddling // token, concatenate additional bytes from that token (if any) to unstable_bytes, // and retokenise the whole thing and see what we get. for i in 1..unstable_bytes.len() { let prefix = &unstable_bytes[..i]; let suffix = &unstable_bytes[i..]; let mut point = self .sorted_token_bytes .partition_point(|x| x.as_slice() < suffix); // TODO: Perf optimisation if suffix starts with " "? while point < self.sorted_token_bytes.len() && self.sorted_token_bytes[point].starts_with(suffix) { let possibility = [prefix, self.sorted_token_bytes[point].as_slice()].concat(); let encoded = match std::str::from_utf8(&possibility) { // Morally, this is byte_pair_encode(&possibility, &self.encoder) // But we might have introduced a regex split which would prevent merges. // (particularly possible in the presence of unstable regex splits) // So convert to UTF-8 and do regex splitting. // E.g. with cl100k_base " !" gets split to " " + " !", // but byte_pair_encode(" !") != byte_pair_encode(" ") Ok(s) => self._encode_ordinary_native(s), // Technically, whether or not this arm is correct depends on whether there // would be a regex split before the UTF-8 truncation point. 
// Probably niche enough that no one will ever notice (after all, people didn't // notice all the big holes in the previous unstable token implementation) Err(_) => byte_pair_encode(&possibility, &self.encoder), // Something like the following is intriguing but incorrect: // Err(e) => self._encode_ordinary_native(unsafe { // std::str::from_utf8_unchecked(&possibility[..e.valid_up_to()]) // }), }; let mut seq = Vec::new(); let mut seq_len = 0; for token in encoded { seq.push(token); seq_len += self.decoder[&token].len(); if seq_len >= unstable_bytes.len() { break; } } completions.insert(seq); point += 1; } } // This is also not straightforward. While we generally assume that regex splits are stable, // unfortunately, they are not. That is, if adding bytes were to make a split appear in // unstable_bytes, this could make tokens possible which our logic would otherwise think // would be merged. // For example, with gpt2, the use of \s+(?!\S) means that "\n\n" could // develop a split, e.g. "\n\n0" splits into "\n"+"\n"+"0", making "\n" a possible token. // Here is a quick and dirty fix: // This isn't right if we ever remove \s+(?!\S) if unstable_bytes.len() > 1 { let last_decoded = bstr::decode_last_utf8(unstable_bytes.as_slice()); if unstable_bytes.len() - last_decoded.1 > 0 && last_decoded.0.map_or(false, |c| c.is_whitespace()) { let mut reencoded = byte_pair_encode( &unstable_bytes[..unstable_bytes.len() - last_decoded.1], &self.encoder, ); reencoded.extend(byte_pair_encode( &unstable_bytes[unstable_bytes.len() - last_decoded.1..], &self.encoder, )); completions.insert(reencoded); } } (tokens, completions) } } #[pymethods] impl CoreBPE { #[new] fn new( encoder: HashMap, usize>, special_tokens_encoder: HashMap, pattern: &str, ) -> PyResult { let regex = Regex::new(pattern) .map_err(|e| PyErr::new::(e.to_string()))?; let special_regex = { let _parts = special_tokens_encoder .keys() .map(|s| fancy_regex::escape(s)) .collect::>(); Regex::new(&_parts.join("|")) .map_err(|e| PyErr::new::(e.to_string()))? }; let decoder: HashMap> = encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); assert!( encoder.len() == decoder.len(), "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?" 
); let special_tokens_decoder: HashMap> = special_tokens_encoder .iter() .map(|(k, v)| (*v, k.as_bytes().to_vec())) .collect(); // Clone because I don't know how to tell Rust I'm not going to change the map let mut sorted_token_bytes: Vec> = encoder.keys().cloned().collect(); sorted_token_bytes.sort(); Ok(CoreBPE { encoder, special_tokens_encoder, decoder, special_tokens_decoder, regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(), special_regex_tls: (0..MAX_NUM_THREADS) .map(|_| special_regex.clone()) .collect(), sorted_token_bytes, }) } // ==================== // Encoding // ==================== fn encode_ordinary(&self, py: Python, text: &str) -> Vec { py.allow_threads(|| self._encode_ordinary_native(text)) } fn encode(&self, py: Python, text: &str, allowed_special: HashSet<&str>) -> Vec { py.allow_threads(|| self._encode_native(text, &allowed_special).0) } fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec { py.allow_threads(|| { match std::str::from_utf8(bytes) { Ok(text) => self._encode_ordinary_native(text), Err(e) => { let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) }; let (tokens, last_piece_token_len) = self._encode_native(text, &HashSet::new()); let (mut tokens, last_piece_token_len) = self._increase_last_piece_token_len(tokens, last_piece_token_len); if !tokens.is_empty() && last_piece_token_len > 0 { // Lop off the tokens from the last piece and run BPE on the remaining bytes // Somewhat niche, but this may not be correct if we'd have had a regex // split between the valid UTF-8 and the invalid bytes, which is why this // method is private let mut unstable_bytes = self._decode_native(&tokens[tokens.len() - last_piece_token_len..]); unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]); tokens.truncate(tokens.len() - last_piece_token_len); tokens.extend(byte_pair_encode(&unstable_bytes, &self.encoder)); } tokens } } }) } fn encode_with_unstable( &self, py: Python, text: &str, allowed_special: HashSet<&str>, ) -> Py { let (tokens, completions) = py.allow_threads(|| self._encode_unstable_native(text, &allowed_special)); let py_completions = PyList::new(py, completions.iter().map(|seq| PyList::new(py, &seq[..]))); (tokens, py_completions).into_py(py) } fn encode_single_token(&self, piece: &[u8]) -> PyResult { if let Some(token) = self.encoder.get(piece).copied() { return Ok(token); } if let Ok(piece_str) = std::str::from_utf8(piece) { if let Some(token) = self.special_tokens_encoder.get(piece_str).copied() { return Ok(token); } } Err(PyErr::new::(piece.to_owned())) } fn encode_single_piece(&self, piece: &[u8]) -> Vec { if let Some(token) = self.encoder.get(piece) { return vec![*token]; } byte_pair_encode(piece, &self.encoder) } // ==================== // Decoding // ==================== fn decode_bytes(&self, py: Python, tokens: Vec) -> Py { let bytes = py.allow_threads(|| self._decode_native(&tokens)); PyBytes::new(py, &bytes).into() } fn decode_single_token_bytes(&self, py: Python, token: usize) -> PyResult> { if let Some(bytes) = self.decoder.get(&token) { return Ok(PyBytes::new(py, bytes).into()); } if let Some(bytes) = self.special_tokens_decoder.get(&token) { return Ok(PyBytes::new(py, bytes).into()); } Err(PyErr::new::(token.to_string())) } // ==================== // Miscellaneous // ==================== fn token_byte_values(&self, py: Python) -> Vec> { self.sorted_token_bytes .iter() .map(|x| PyBytes::new(py, x).into()) .collect() } } #[pymodule] fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> { 
m.add_class::()?; Ok(()) } #[cfg(test)] mod tests { use rustc_hash::FxHashMap as HashMap; use crate::byte_pair_split; #[test] fn very_simple_test() { let mut ranks = HashMap::default(); ranks.insert(b"ab".to_vec(), 1); ranks.insert(b"cd".to_vec(), 2); let res = byte_pair_split(b"abcd", &ranks); assert_eq!(res, vec![b"ab", b"cd"]); } } tiktoken-0.6.0/tests/000077500000000000000000000000001456133667000145225ustar00rootroot00000000000000tiktoken-0.6.0/tests/__init__.py000066400000000000000000000000001456133667000166210ustar00rootroot00000000000000tiktoken-0.6.0/tests/test_encoding.py000066400000000000000000000164361456133667000177330ustar00rootroot00000000000000# Note that there are more actual tests, they're just not currently public :-) from typing import Callable import hypothesis import hypothesis.strategies as st import pytest import tiktoken from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES def test_simple(): enc = tiktoken.get_encoding("gpt2") assert enc.encode("hello world") == [31373, 995] assert enc.decode([31373, 995]) == "hello world" assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256] enc = tiktoken.get_encoding("cl100k_base") assert enc.encode("hello world") == [15339, 1917] assert enc.decode([15339, 1917]) == "hello world" assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257] for enc_name in tiktoken.list_encoding_names(): enc = tiktoken.get_encoding(enc_name) for token in range(10_000): assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token def test_simple_repeated(): enc = tiktoken.get_encoding("gpt2") assert enc.encode("0") == [15] assert enc.encode("00") == [405] assert enc.encode("000") == [830] assert enc.encode("0000") == [2388] assert enc.encode("00000") == [20483] assert enc.encode("000000") == [10535] assert enc.encode("0000000") == [24598] assert enc.encode("00000000") == [8269] assert enc.encode("000000000") == [10535, 830] assert enc.encode("0000000000") == [8269, 405] assert enc.encode("00000000000") == [8269, 830] assert enc.encode("000000000000") == [8269, 2388] assert enc.encode("0000000000000") == [8269, 20483] assert enc.encode("00000000000000") == [8269, 10535] assert enc.encode("000000000000000") == [8269, 24598] assert enc.encode("0000000000000000") == [25645] assert enc.encode("00000000000000000") == [8269, 10535, 830] def test_simple_regex(): enc = tiktoken.get_encoding("cl100k_base") assert enc.encode("rer") == [38149] assert enc.encode("'rer") == [2351, 81] assert enc.encode("today\n ") == [31213, 198, 220] assert enc.encode("today\n \n") == [31213, 27907] assert enc.encode("today\n \n") == [31213, 14211] def test_basic_encode(): enc = tiktoken.get_encoding("r50k_base") assert enc.encode("hello world") == [31373, 995] enc = tiktoken.get_encoding("p50k_base") assert enc.encode("hello world") == [31373, 995] enc = tiktoken.get_encoding("cl100k_base") assert enc.encode("hello world") == [15339, 1917] assert enc.encode(" \x850") == [220, 126, 227, 15] def test_encode_empty(): enc = tiktoken.get_encoding("r50k_base") assert enc.encode("") == [] def test_encode_bytes(): enc = tiktoken.get_encoding("cl100k_base") assert enc._encode_bytes(b" \xec\x8b\xa4\xed") == [62085] def test_encode_surrogate_pairs(): enc = tiktoken.get_encoding("cl100k_base") assert enc.encode("👍") == [9468, 239, 235] # surrogate pair gets converted to codepoint assert enc.encode("\ud83d\udc4d") == [9468, 239, 235] # lone surrogate just gets replaced assert enc.encode("\ud83d") == 
enc.encode("�") # ==================== # Roundtrip # ==================== @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES) def test_basic_roundtrip(make_enc): enc = make_enc() for value in ( "hello", "hello ", "hello ", " hello", " hello ", " hello ", "hello world", "请考试我的软件!12345", ): assert value == enc.decode(enc.encode(value)) assert value == enc.decode(enc.encode_ordinary(value)) @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES) @hypothesis.given(text=st.text()) @hypothesis.settings(deadline=None) def test_hyp_roundtrip(make_enc: Callable[[], tiktoken.Encoding], text): enc = make_enc() assert text == enc.decode(enc.encode(text)) @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES) def test_single_token_roundtrip(make_enc: Callable[[], tiktoken.Encoding]): enc = make_enc() for token in range(enc.n_vocab): try: token_bytes = enc.decode_single_token_bytes(token) except KeyError: continue assert enc.encode_single_token(token_bytes) == token # ==================== # Special tokens # ==================== def test_special_token(): enc = tiktoken.get_encoding("cl100k_base") eot = enc.encode_single_token("<|endoftext|>") assert eot == enc.eot_token fip = enc.encode_single_token("<|fim_prefix|>") fim = enc.encode_single_token("<|fim_middle|>") text = "<|endoftext|> hello <|fim_prefix|>" assert eot not in enc.encode(text, disallowed_special=()) with pytest.raises(ValueError): enc.encode(text) with pytest.raises(ValueError): enc.encode(text, disallowed_special="all") with pytest.raises(ValueError): enc.encode(text, disallowed_special={"<|endoftext|>"}) with pytest.raises(ValueError): enc.encode(text, disallowed_special={"<|fim_prefix|>"}) text = "<|endoftext|> hello <|fim_prefix|> there <|fim_middle|>" tokens = enc.encode(text, disallowed_special=()) assert eot not in tokens assert fip not in tokens assert fim not in tokens tokens = enc.encode(text, allowed_special="all", disallowed_special=()) assert eot in tokens assert fip in tokens assert fim in tokens tokens = enc.encode(text, allowed_special="all", disallowed_special="all") assert eot in tokens assert fip in tokens assert fim in tokens tokens = enc.encode(text, allowed_special={"<|fim_prefix|>"}, disallowed_special=()) assert eot not in tokens assert fip in tokens assert fim not in tokens tokens = enc.encode(text, allowed_special={"<|endoftext|>"}, disallowed_special=()) assert eot in tokens assert fip not in tokens assert fim not in tokens tokens = enc.encode(text, allowed_special={"<|fim_middle|>"}, disallowed_special=()) assert eot not in tokens assert fip not in tokens assert fim in tokens @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES) @hypothesis.given(text=st.text()) @hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES) def test_hyp_special_ordinary(make_enc, text: str): enc = make_enc() assert enc.encode_ordinary(text) == enc.encode(text, disallowed_special=()) # ==================== # Batch encoding # ==================== @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES) def test_batch_encode(make_enc: Callable[[], tiktoken.Encoding]): enc = make_enc() text1 = "hello world" text2 = "goodbye world" assert enc.encode_batch([text1]) == [enc.encode(text1)] assert enc.encode_batch([text1, text2]) == [enc.encode(text1), enc.encode(text2)] assert enc.encode_ordinary_batch([text1]) == [enc.encode_ordinary(text1)] assert enc.encode_ordinary_batch([text1, text2]) == [ enc.encode_ordinary(text1), enc.encode_ordinary(text2), ] @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES) 
@hypothesis.given(batch=st.lists(st.text())) @hypothesis.settings(deadline=None) def test_hyp_batch_roundtrip(make_enc: Callable[[], tiktoken.Encoding], batch): enc = make_enc() encoded = enc.encode_batch(batch) assert encoded == [enc.encode(t) for t in batch] decoded = enc.decode_batch(encoded) assert decoded == batch tiktoken-0.6.0/tests/test_helpers.py000066400000000000000000000007351456133667000176020ustar00rootroot00000000000000import bisect import functools import os import pytest import tiktoken MAX_EXAMPLES: int = int(os.environ.get("TIKTOKEN_MAX_EXAMPLES", "100")) ENCODINGS = ["r50k_base", "cl100k_base"] SOME_ENCODINGS = ["cl100k_base"] ENCODING_FACTORIES = [ pytest.param(functools.partial(tiktoken.get_encoding, name), id=name) for name in ENCODINGS ] SOME_ENCODING_FACTORIES = [ pytest.param(functools.partial(tiktoken.get_encoding, name), id=name) for name in SOME_ENCODINGS ] tiktoken-0.6.0/tests/test_misc.py000066400000000000000000000011611456133667000170650ustar00rootroot00000000000000import subprocess import sys import tiktoken def test_encoding_for_model(): enc = tiktoken.encoding_for_model("gpt2") assert enc.name == "gpt2" enc = tiktoken.encoding_for_model("text-davinci-003") assert enc.name == "p50k_base" enc = tiktoken.encoding_for_model("text-davinci-edit-001") assert enc.name == "p50k_edit" enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301") assert enc.name == "cl100k_base" def test_optional_blobfile_dependency(): prog = """ import tiktoken import sys assert "blobfile" not in sys.modules """ subprocess.check_call([sys.executable, "-c", prog]) tiktoken-0.6.0/tests/test_offsets.py000066400000000000000000000051251456133667000176070ustar00rootroot00000000000000from typing import Callable import hypothesis import pytest from hypothesis import strategies as st import tiktoken from .test_helpers import MAX_EXAMPLES, SOME_ENCODING_FACTORIES def _common_prefix_len(a, b): i = 0 while i < len(a) and i < len(b) and a[i] == b[i]: i += 1 return i def _token_offsets_reference(enc, tokens): text = enc.decode(tokens, errors="strict") res = [] for i in range(len(tokens)): prefix = enc.decode(tokens[:i], errors="ignore") res.append(_common_prefix_len(text, prefix)) return res @pytest.mark.parametrize("make_enc", SOME_ENCODING_FACTORIES) @hypothesis.given(data=st.data()) @hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES) def test_hyp_offsets(make_enc: Callable[[], tiktoken.Encoding], data): enc = make_enc() tokens_st = st.lists( st.integers(0, enc.n_vocab - 1).filter( lambda x: x in enc._special_tokens.values() or x in enc._mergeable_ranks.values() ), min_size=1, max_size=20, ) tokens = data.draw(tokens_st) # This is a dumb hack to make sure that our tokens are a valid UTF-8 string # We could potentially drop this, see the TODO in decode_with_offsets tokens = enc.encode(enc.decode(tokens, errors="ignore"), allowed_special="all") assert enc.decode_with_offsets(tokens)[1] == _token_offsets_reference(enc, tokens) def test_basic_offsets(): enc = tiktoken.get_encoding("cl100k_base") prompt = "hello world" p, o = enc.decode_with_offsets(enc.encode(prompt)) assert p == prompt assert o == [0, 5] prompt = "hello world<|endoftext|> green cow" p, o = enc.decode_with_offsets(enc.encode(prompt, allowed_special="all")) assert p == prompt assert o == [0, 5, 11, 24, 30] prompt = "我非常渴望与人工智能一起工作" p, o = enc.decode_with_offsets(enc.encode(prompt)) assert p == prompt assert o == [0, 1, 2, 3, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13] # contains the interesting tokens b'\xe0\xae\xbf\xe0\xae' and 
b'\xe0\xaf\x8d\xe0\xae' # in which \xe0 is the start of a 3-byte UTF-8 character prompt = "நடிகர் சூர்யா" p, o = enc.decode_with_offsets(enc.encode(prompt)) assert p == prompt assert o == [0, 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 11, 12, 12] # contains the interesting token b'\xa0\xe9\x99\xa4' # in which \xe9 is the start of a 3-byte UTF-8 character and \xa0 is a continuation byte prompt = " Ġ除" p, o = enc.decode_with_offsets(enc.encode(prompt)) assert p == prompt assert o == [0, 1] tiktoken-0.6.0/tests/test_simple_public.py000066400000000000000000000026371456133667000207720ustar00rootroot00000000000000import subprocess import sys import tiktoken def test_simple(): # Note that there are more actual tests, they're just not currently public :-) enc = tiktoken.get_encoding("gpt2") assert enc.encode("hello world") == [31373, 995] assert enc.decode([31373, 995]) == "hello world" assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256] enc = tiktoken.get_encoding("cl100k_base") assert enc.encode("hello world") == [15339, 1917] assert enc.decode([15339, 1917]) == "hello world" assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257] for enc_name in tiktoken.list_encoding_names(): enc = tiktoken.get_encoding(enc_name) for token in range(10_000): assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token def test_encoding_for_model(): enc = tiktoken.encoding_for_model("gpt2") assert enc.name == "gpt2" enc = tiktoken.encoding_for_model("text-davinci-003") assert enc.name == "p50k_base" enc = tiktoken.encoding_for_model("text-davinci-edit-001") assert enc.name == "p50k_edit" enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301") assert enc.name == "cl100k_base" def test_optional_blobfile_dependency(): prog = """ import tiktoken import sys assert "blobfile" not in sys.modules """ subprocess.check_call([sys.executable, "-c", prog]) tiktoken-0.6.0/tiktoken/000077500000000000000000000000001456133667000152105ustar00rootroot00000000000000tiktoken-0.6.0/tiktoken/__init__.py000066400000000000000000000005021456133667000173160ustar00rootroot00000000000000# This is the public API of tiktoken from .core import Encoding as Encoding from .model import encoding_for_model as encoding_for_model from .model import encoding_name_for_model as encoding_name_for_model from .registry import get_encoding as get_encoding from .registry import list_encoding_names as list_encoding_names tiktoken-0.6.0/tiktoken/_educational.py000066400000000000000000000200521456133667000202100ustar00rootroot00000000000000"""This is an educational implementation of the byte pair encoding algorithm.""" import collections from typing import Optional import regex import tiktoken class SimpleBytePairEncoding: def __init__(self, *, pat_str: str, mergeable_ranks: dict[bytes, int]) -> None: """Creates an Encoding object.""" # A regex pattern string that is used to split the input text self.pat_str = pat_str # A dictionary mapping token bytes to their ranks. The ranks correspond to merge priority self.mergeable_ranks = mergeable_ranks self._decoder = {token: token_bytes for token_bytes, token in mergeable_ranks.items()} self._pat = regex.compile(pat_str) def encode(self, text: str, visualise: Optional[str] = "colour") -> list[int]: """Encodes a string into tokens. 
>>> enc.encode("hello world") [388, 372] """ # Use the regex to split the text into (approximately) words words = self._pat.findall(text) tokens = [] for word in words: # Turn each word into tokens, using the byte pair encoding algorithm word_bytes = word.encode("utf-8") word_tokens = bpe_encode(self.mergeable_ranks, word_bytes, visualise=visualise) tokens.extend(word_tokens) return tokens def decode_bytes(self, tokens: list[int]) -> bytes: """Decodes a list of tokens into bytes. >>> enc.decode_bytes([388, 372]) b'hello world' """ return b"".join(self._decoder[token] for token in tokens) def decode(self, tokens: list[int]) -> str: """Decodes a list of tokens into a string. Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace the invalid bytes with the replacement character "�". >>> enc.decode([388, 372]) 'hello world' """ return self.decode_bytes(tokens).decode("utf-8", errors="replace") def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]: """Decodes a list of tokens into a list of bytes. Useful for visualising how a string is tokenised. >>> enc.decode_tokens_bytes([388, 372]) [b'hello', b' world'] """ return [self._decoder[token] for token in tokens] @staticmethod def train(training_data: str, vocab_size: int, pat_str: str): """Train a BPE tokeniser on some data!""" mergeable_ranks = bpe_train(data=training_data, vocab_size=vocab_size, pat_str=pat_str) return SimpleBytePairEncoding(pat_str=pat_str, mergeable_ranks=mergeable_ranks) @staticmethod def from_tiktoken(encoding): if isinstance(encoding, str): encoding = tiktoken.get_encoding(encoding) return SimpleBytePairEncoding( pat_str=encoding._pat_str, mergeable_ranks=encoding._mergeable_ranks ) def bpe_encode( mergeable_ranks: dict[bytes, int], input: bytes, visualise: Optional[str] = "colour" ) -> list[int]: parts = [bytes([b]) for b in input] while True: # See the intermediate merges play out! if visualise: if visualise in ["colour", "color"]: visualise_tokens(parts) elif visualise == "simple": print(parts) # Iterate over all pairs and find the pair we want to merge the most min_idx = None min_rank = None for i, pair in enumerate(zip(parts[:-1], parts[1:])): rank = mergeable_ranks.get(pair[0] + pair[1]) if rank is not None and (min_rank is None or rank < min_rank): min_idx = i min_rank = rank # If there were no pairs we could merge, we're done! if min_rank is None: break assert min_idx is not None # Otherwise, merge that pair and leave the rest unchanged. Then repeat. parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :] if visualise: print() tokens = [mergeable_ranks[part] for part in parts] return tokens def bpe_train( data: str, vocab_size: int, pat_str: str, visualise: Optional[str] = "colour" ) -> dict[bytes, int]: # First, add tokens for each individual byte value if vocab_size < 2**8: raise ValueError("vocab_size must be at least 256, so we can encode all bytes") ranks = {} for i in range(2**8): ranks[bytes([i])] = i # Splinter up our data into lists of bytes # data = "Hello world" # words = [ # [b'H', b'e', b'l', b'l', b'o'], # [b' ', b'w', b'o', b'r', b'l', b'd'] # ] words: list[list[bytes]] = [ [bytes([b]) for b in word.encode("utf-8")] for word in regex.findall(pat_str, data) ] # Now, use our data to figure out which merges we should make while len(ranks) < vocab_size: # Find the most common pair. 
This will become our next token stats = collections.Counter() for piece in words: for pair in zip(piece[:-1], piece[1:]): stats[pair] += 1 most_common_pair = max(stats, key=lambda x: stats[x]) token_bytes = most_common_pair[0] + most_common_pair[1] token = len(ranks) # Add the new token! ranks[token_bytes] = token # Now merge that most common pair in all the words. That is, update our training data # to reflect our decision to make that pair into a new token. new_words = [] for word in words: new_word = [] i = 0 while i < len(word) - 1: if (word[i], word[i + 1]) == most_common_pair: # We found our pair! Merge it new_word.append(token_bytes) i += 2 else: new_word.append(word[i]) i += 1 if i == len(word) - 1: new_word.append(word[i]) new_words.append(new_word) words = new_words # See the intermediate merges play out! if visualise: print(f"The current most common pair is {most_common_pair[0]} + {most_common_pair[1]}") print(f"So we made {token_bytes} our {len(ranks)}th token") if visualise in ["colour", "color"]: print("Now the first fifty words in our training data look like:") visualise_tokens([token for word in words[:50] for token in word]) elif visualise == "simple": print("Now the first twenty words in our training data look like:") for word in words[:20]: print(word) print("\n") return ranks def visualise_tokens(token_values: list[bytes]) -> None: background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]] # If token boundaries do not occur at unicode character boundaries, it's unclear how best to # visualise the token. Here, we'll just use the unicode replacement character to represent some # fraction of a character. unicode_token_values = [x.decode("utf-8", errors="replace") for x in token_values] running_length = 0 last_color = None for token in unicode_token_values: color = background[running_length % len(background)] if color == last_color: color = background[(running_length + 1) % len(background)] assert color != last_color last_color = color running_length += len(token) print(color + token, end="") print("\u001b[0m") def train_simple_encoding(): gpt2_pattern = ( r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ) with open(__file__, "r") as f: data = f.read() enc = SimpleBytePairEncoding.train(data, vocab_size=600, pat_str=gpt2_pattern) print("This is the sequence of merges performed in order to encode 'hello world':") tokens = enc.encode("hello world") assert enc.decode(tokens) == "hello world" assert enc.decode_bytes(tokens) == b"hello world" assert enc.decode_tokens_bytes(tokens) == [b"hello", b" world"] return enc tiktoken-0.6.0/tiktoken/core.py000066400000000000000000000373731456133667000165270ustar00rootroot00000000000000from __future__ import annotations import functools from concurrent.futures import ThreadPoolExecutor from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union import regex from tiktoken import _tiktoken class Encoding: def __init__( self, name: str, *, pat_str: str, mergeable_ranks: dict[bytes, int], special_tokens: dict[str, int], explicit_n_vocab: Optional[int] = None, ): """Creates an Encoding object. See openai_public.py for examples of how to construct an Encoding object. Args: name: The name of the encoding. It should be clear from the name of the encoding what behaviour to expect, in particular, encodings with different special tokens should have different names. pat_str: A regex pattern string that is used to split the input text. 
mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks must correspond to merge priority. special_tokens: A dictionary mapping special token strings to their token values. explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked that the number of mergeable tokens and special tokens is equal to this number. """ self.name = name self._pat_str = pat_str self._mergeable_ranks = mergeable_ranks self._special_tokens = special_tokens self.max_token_value = max( max(mergeable_ranks.values()), max(special_tokens.values(), default=0) ) if explicit_n_vocab: assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab assert self.max_token_value == explicit_n_vocab - 1 self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str) def __repr__(self) -> str: return f"" # ==================== # Encoding # ==================== def encode_ordinary(self, text: str) -> list[int]: """Encodes a string into tokens, ignoring special tokens. This is equivalent to `encode(text, disallowed_special=())` (but slightly faster). ``` >>> enc.encode_ordinary("hello world") [31373, 995] """ try: return self._core_bpe.encode_ordinary(text) except UnicodeEncodeError: # See comment in encode text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace") return self._core_bpe.encode_ordinary(text) def encode( self, text: str, *, allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006 disallowed_special: Union[Literal["all"], Collection[str]] = "all", ) -> list[int]: """Encodes a string into tokens. Special tokens are artificial tokens used to unlock capabilities from a model, such as fill-in-the-middle. So we want to be careful about accidentally encoding special tokens, since they can be used to trick a model into doing something we don't want it to do. Hence, by default, encode will raise an error if it encounters text that corresponds to a special token. This can be controlled on a per-token level using the `allowed_special` and `disallowed_special` parameters. In particular: - Setting `disallowed_special` to () will prevent this function from raising errors and cause all text corresponding to special tokens to be encoded as natural text. - Setting `allowed_special` to "all" will cause this function to treat all text corresponding to special tokens to be encoded as special tokens. ``` >>> enc.encode("hello world") [31373, 995] >>> enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}) [50256] >>> enc.encode("<|endoftext|>", allowed_special="all") [50256] >>> enc.encode("<|endoftext|>") # Raises ValueError >>> enc.encode("<|endoftext|>", disallowed_special=()) [27, 91, 437, 1659, 5239, 91, 29] ``` """ if allowed_special == "all": allowed_special = self.special_tokens_set if disallowed_special == "all": disallowed_special = self.special_tokens_set - allowed_special if disallowed_special: if not isinstance(disallowed_special, frozenset): disallowed_special = frozenset(disallowed_special) if match := _special_token_regex(disallowed_special).search(text): raise_disallowed_special_token(match.group()) # https://github.com/PyO3/pyo3/pull/3632 if isinstance(allowed_special, frozenset): allowed_special = set(allowed_special) try: return self._core_bpe.encode(text, allowed_special) except UnicodeEncodeError: # BPE operates on bytes, but the regex operates on unicode. If we pass a str that is # invalid UTF-8 to Rust, it will rightfully complain. 
Here we do a quick and dirty # fixup for any surrogate pairs that may have sneaked their way into the text. # Technically, this introduces a place where encode + decode doesn't roundtrip a Python # string, but given that this is input we want to support, maybe that's okay. # Also we use errors="replace" to handle weird things like lone surrogates. text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace") return self._core_bpe.encode(text, allowed_special) def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]: """Encodes a list of strings into tokens, in parallel, ignoring special tokens. This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster). ``` >>> enc.encode_ordinary_batch(["hello world", "goodbye world"]) [[31373, 995], [11274, 16390, 995]] ``` """ encoder = functools.partial(self.encode_ordinary) with ThreadPoolExecutor(num_threads) as e: return list(e.map(encoder, text)) def encode_batch( self, text: list[str], *, num_threads: int = 8, allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006 disallowed_special: Union[Literal["all"], Collection[str]] = "all", ) -> list[list[int]]: """Encodes a list of strings into tokens, in parallel. See `encode` for more details on `allowed_special` and `disallowed_special`. ``` >>> enc.encode_batch(["hello world", "goodbye world"]) [[31373, 995], [11274, 16390, 995]] ``` """ if allowed_special == "all": allowed_special = self.special_tokens_set if disallowed_special == "all": disallowed_special = self.special_tokens_set - allowed_special if not isinstance(disallowed_special, frozenset): disallowed_special = frozenset(disallowed_special) encoder = functools.partial( self.encode, allowed_special=allowed_special, disallowed_special=disallowed_special ) with ThreadPoolExecutor(num_threads) as e: return list(e.map(encoder, text)) def encode_with_unstable( self, text: str, *, allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006 disallowed_special: Union[Literal["all"], Collection[str]] = "all", ) -> tuple[list[int], list[list[int]]]: """Encodes a string into stable tokens and possible completion sequences. Note that the stable tokens will only represent a substring of `text`. See `encode` for more details on `allowed_special` and `disallowed_special`. This API should itself be considered unstable. ``` >>> enc.encode_with_unstable("hello fanta") ([31373], [(277, 4910), (5113, 265), ..., (8842,)]) >>> text = "..." >>> stable_tokens, completions = enc.encode_with_unstable(text) >>> assert text.encode().startswith(enc.decode_bytes(stable_tokens)) >>> assert all(enc.decode_bytes(stable_tokens + seq).startswith(text.encode()) for seq in completions) ``` """ if allowed_special == "all": allowed_special = self.special_tokens_set if disallowed_special == "all": disallowed_special = self.special_tokens_set - allowed_special if disallowed_special: if not isinstance(disallowed_special, frozenset): disallowed_special = frozenset(disallowed_special) if match := _special_token_regex(disallowed_special).search(text): raise_disallowed_special_token(match.group()) return self._core_bpe.encode_with_unstable(text, allowed_special) def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int: """Encodes text corresponding to a single token to its token value. NOTE: this will encode all special tokens. Raises `KeyError` if the token is not in the vocabulary. 
``` >>> enc.encode_single_token("hello") 31373 ``` """ if isinstance(text_or_bytes, str): text_or_bytes = text_or_bytes.encode("utf-8") return self._core_bpe.encode_single_token(text_or_bytes) # ==================== # Decoding # ==================== def decode_bytes(self, tokens: list[int]) -> bytes: """Decodes a list of tokens into bytes. ``` >>> enc.decode_bytes([31373, 995]) b'hello world' ``` """ return self._core_bpe.decode_bytes(tokens) def decode(self, tokens: list[int], errors: str = "replace") -> str: """Decodes a list of tokens into a string. WARNING: the default behaviour of this function is lossy, since decoded bytes are not guaranteed to be valid UTF-8. You can control this behaviour using the `errors` parameter, for instance, setting `errors=strict`. ``` >>> enc.decode([31373, 995]) 'hello world' ``` """ return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors) def decode_single_token_bytes(self, token: int) -> bytes: """Decodes a token into bytes. NOTE: this will decode all special tokens. Raises `KeyError` if the token is not in the vocabulary. ``` >>> enc.decode_single_token_bytes(31373) b'hello' ``` """ return self._core_bpe.decode_single_token_bytes(token) def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]: """Decodes a list of tokens into a list of bytes. Useful for visualising tokenisation. >>> enc.decode_tokens_bytes([31373, 995]) [b'hello', b' world'] """ return [self.decode_single_token_bytes(token) for token in tokens] def decode_with_offsets(self, tokens: list[int]) -> tuple[str, list[int]]: """Decodes a list of tokens into a string and a list of offsets. Each offset is the index into text corresponding to the start of each token. If UTF-8 character boundaries do not line up with token boundaries, the offset is the index of the first character that contains bytes from the token. This will currently raise if given tokens that decode to invalid UTF-8; this behaviour may change in the future to be more permissive. >>> enc.decode_with_offsets([31373, 995]) ('hello world', [0, 5]) """ token_bytes = self.decode_tokens_bytes(tokens) text_len = 0 offsets = [] for token in token_bytes: offsets.append(max(0, text_len - (0x80 <= token[0] < 0xC0))) text_len += sum(1 for c in token if not 0x80 <= c < 0xC0) # TODO: assess correctness for errors="ignore" and errors="replace" text = b"".join(token_bytes).decode("utf-8", errors="strict") return text, offsets def decode_batch( self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8 ) -> list[str]: """Decodes a batch (list of lists of tokens) into a list of strings.""" decoder = functools.partial(self.decode, errors=errors) with ThreadPoolExecutor(num_threads) as e: return list(e.map(decoder, batch)) def decode_bytes_batch(self, batch: list[list[int]], *, num_threads: int = 8) -> list[bytes]: """Decodes a batch (list of lists of tokens) into a list of bytes.""" with ThreadPoolExecutor(num_threads) as e: return list(e.map(self.decode_bytes, batch)) # ==================== # Miscellaneous # ==================== def token_byte_values(self) -> list[bytes]: """Returns the list of all token byte values.""" return self._core_bpe.token_byte_values() @property def eot_token(self) -> int: return self._special_tokens["<|endoftext|>"] @functools.cached_property def special_tokens_set(self) -> set[str]: return set(self._special_tokens.keys()) @property def n_vocab(self) -> int: """For backwards compatibility. 
Prefer to use `enc.max_token_value + 1`.""" return self.max_token_value + 1 # ==================== # Private # ==================== def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]: """Encodes text corresponding to bytes without a regex split. NOTE: this will not encode any special tokens. ``` >>> enc.encode_single_piece("helloqqqq") [31373, 38227, 38227] ``` """ if isinstance(text_or_bytes, str): text_or_bytes = text_or_bytes.encode("utf-8") return self._core_bpe.encode_single_piece(text_or_bytes) def _encode_only_native_bpe(self, text: str) -> list[int]: """Encodes a string into tokens, but do regex splitting in Python.""" _unused_pat = regex.compile(self._pat_str) ret = [] for piece in regex.findall(_unused_pat, text): ret.extend(self._core_bpe.encode_single_piece(piece)) return ret def _encode_bytes(self, text: bytes) -> list[int]: return self._core_bpe._encode_bytes(text) def __getstate__(self) -> object: import tiktoken.registry # As an optimisation, pickle registered encodings by reference if self is tiktoken.registry.ENCODINGS.get(self.name): return self.name return { "name": self.name, "pat_str": self._pat_str, "mergeable_ranks": self._mergeable_ranks, "special_tokens": self._special_tokens, } def __setstate__(self, value: object) -> None: import tiktoken.registry if isinstance(value, str): self.__dict__ = tiktoken.registry.get_encoding(value).__dict__ return self.__init__(**value) @functools.lru_cache(maxsize=128) def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]": inner = "|".join(regex.escape(token) for token in tokens) return regex.compile(f"({inner})") def raise_disallowed_special_token(token: str) -> NoReturn: raise ValueError( f"Encountered text corresponding to disallowed special token {token!r}.\n" "If you want this text to be encoded as a special token, " f"pass it to `allowed_special`, e.g. `allowed_special={{{token!r}, ...}}`.\n" f"If you want this text to be encoded as normal text, disable the check for this token " f"by passing `disallowed_special=(enc.special_tokens_set - {{{token!r}}})`.\n" "To disable this check for all special tokens, pass `disallowed_special=()`.\n" ) tiktoken-0.6.0/tiktoken/load.py000066400000000000000000000123471456133667000165100ustar00rootroot00000000000000from __future__ import annotations import base64 import hashlib import json import os import tempfile import uuid from typing import Optional import requests def read_file(blobpath: str) -> bytes: if not blobpath.startswith("http://") and not blobpath.startswith("https://"): try: import blobfile except ImportError as e: raise ImportError( "blobfile is not installed. Please install it by running `pip install blobfile`." 
) from e with blobfile.BlobFile(blobpath, "rb") as f: return f.read() # avoiding blobfile for public files helps avoid auth issues, like MFA prompts resp = requests.get(blobpath) resp.raise_for_status() return resp.content def check_hash(data: bytes, expected_hash: str) -> bool: actual_hash = hashlib.sha256(data).hexdigest() return actual_hash == expected_hash def read_file_cached(blobpath: str, expected_hash: Optional[str] = None) -> bytes: user_specified_cache = True if "TIKTOKEN_CACHE_DIR" in os.environ: cache_dir = os.environ["TIKTOKEN_CACHE_DIR"] elif "DATA_GYM_CACHE_DIR" in os.environ: cache_dir = os.environ["DATA_GYM_CACHE_DIR"] else: cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache") user_specified_cache = False if cache_dir == "": # disable caching return read_file(blobpath) cache_key = hashlib.sha1(blobpath.encode()).hexdigest() cache_path = os.path.join(cache_dir, cache_key) if os.path.exists(cache_path): with open(cache_path, "rb") as f: data = f.read() if expected_hash is None or check_hash(data, expected_hash): return data # the cached file does not match the hash, remove it and re-fetch try: os.remove(cache_path) except OSError: pass contents = read_file(blobpath) if expected_hash and not check_hash(contents, expected_hash): raise ValueError( f"Hash mismatch for data downloaded from {blobpath} (expected {expected_hash}). " f"This may indicate a corrupted download. Please try again." ) try: os.makedirs(cache_dir, exist_ok=True) tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp" with open(tmp_filename, "wb") as f: f.write(contents) os.rename(tmp_filename, cache_path) except OSError: # don't raise if we can't write to the default cache, e.g. issue #75 if user_specified_cache: raise return contents def data_gym_to_mergeable_bpe_ranks( vocab_bpe_file: str, encoder_json_file: str, vocab_bpe_hash: Optional[str] = None, encoder_json_hash: Optional[str] = None, ) -> dict[bytes, int]: # NB: do not add caching to this function rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "] data_gym_byte_to_byte = {chr(b): b for b in rank_to_intbyte} n = 0 for b in range(2**8): if b not in rank_to_intbyte: rank_to_intbyte.append(b) data_gym_byte_to_byte[chr(2**8 + n)] = b n += 1 assert len(rank_to_intbyte) == 2**8 # vocab_bpe contains the merges along with associated ranks vocab_bpe_contents = read_file_cached(vocab_bpe_file, vocab_bpe_hash).decode() bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]] def decode_data_gym(value: str) -> bytes: return bytes(data_gym_byte_to_byte[b] for b in value) # add the single byte tokens bpe_ranks = {bytes([b]): i for i, b in enumerate(rank_to_intbyte)} # add the merged tokens n = len(bpe_ranks) for first, second in bpe_merges: bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n n += 1 # check that the encoder file matches the merges file # this sanity check is important since tiktoken assumes that ranks are ordered the same # as merge priority encoder_json = json.loads(read_file_cached(encoder_json_file, encoder_json_hash)) encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()} # drop these two special tokens if present, since they're not mergeable bpe tokens encoder_json_loaded.pop(b"<|endoftext|>", None) encoder_json_loaded.pop(b"<|startoftext|>", None) assert bpe_ranks == encoder_json_loaded return bpe_ranks def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None: try: import blobfile except 
ImportError as e: raise ImportError( "blobfile is not installed. Please install it by running `pip install blobfile`." ) from e with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f: for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]): f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n") def load_tiktoken_bpe( tiktoken_bpe_file: str, expected_hash: Optional[str] = None ) -> dict[bytes, int]: # NB: do not add caching to this function contents = read_file_cached(tiktoken_bpe_file, expected_hash) return { base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line) } tiktoken-0.6.0/tiktoken/model.py000066400000000000000000000067531456133667000166750ustar00rootroot00000000000000from __future__ import annotations from .core import Encoding from .registry import get_encoding # TODO: these will likely be replaced by an API endpoint MODEL_PREFIX_TO_ENCODING: dict[str, str] = { # chat "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. "gpt-35-turbo-": "cl100k_base", # Azure deployment name # fine-tuned "ft:gpt-4": "cl100k_base", "ft:gpt-3.5-turbo": "cl100k_base", "ft:davinci-002": "cl100k_base", "ft:babbage-002": "cl100k_base", } MODEL_TO_ENCODING: dict[str, str] = { # chat "gpt-4": "cl100k_base", "gpt-3.5-turbo": "cl100k_base", "gpt-3.5": "cl100k_base", # Common shorthand "gpt-35-turbo": "cl100k_base", # Azure deployment name # base "davinci-002": "cl100k_base", "babbage-002": "cl100k_base", # embeddings "text-embedding-ada-002": "cl100k_base", "text-embedding-3-small": "cl100k_base", "text-embedding-3-large": "cl100k_base", # DEPRECATED MODELS # text (DEPRECATED) "text-davinci-003": "p50k_base", "text-davinci-002": "p50k_base", "text-davinci-001": "r50k_base", "text-curie-001": "r50k_base", "text-babbage-001": "r50k_base", "text-ada-001": "r50k_base", "davinci": "r50k_base", "curie": "r50k_base", "babbage": "r50k_base", "ada": "r50k_base", # code (DEPRECATED) "code-davinci-002": "p50k_base", "code-davinci-001": "p50k_base", "code-cushman-002": "p50k_base", "code-cushman-001": "p50k_base", "davinci-codex": "p50k_base", "cushman-codex": "p50k_base", # edit (DEPRECATED) "text-davinci-edit-001": "p50k_edit", "code-davinci-edit-001": "p50k_edit", # old embeddings (DEPRECATED) "text-similarity-davinci-001": "r50k_base", "text-similarity-curie-001": "r50k_base", "text-similarity-babbage-001": "r50k_base", "text-similarity-ada-001": "r50k_base", "text-search-davinci-doc-001": "r50k_base", "text-search-curie-doc-001": "r50k_base", "text-search-babbage-doc-001": "r50k_base", "text-search-ada-doc-001": "r50k_base", "code-search-babbage-code-001": "r50k_base", "code-search-ada-code-001": "r50k_base", # open source "gpt2": "gpt2", "gpt-2": "gpt2", # Maintains consistency with gpt-4 } def encoding_name_for_model(model_name: str) -> str: """Returns the name of the encoding used by a model. Raises a KeyError if the model name is not recognised. 
""" encoding_name = None if model_name in MODEL_TO_ENCODING: encoding_name = MODEL_TO_ENCODING[model_name] else: # Check if the model matches a known prefix # Prefix matching avoids needing library updates for every model version release # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE) for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items(): if model_name.startswith(model_prefix): return model_encoding_name if encoding_name is None: raise KeyError( f"Could not automatically map {model_name} to a tokeniser. " "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect." ) from None return encoding_name def encoding_for_model(model_name: str) -> Encoding: """Returns the encoding used by a model. Raises a KeyError if the model name is not recognised. """ return get_encoding(encoding_name_for_model(model_name)) tiktoken-0.6.0/tiktoken/py.typed000066400000000000000000000000001456133667000166750ustar00rootroot00000000000000tiktoken-0.6.0/tiktoken/registry.py000066400000000000000000000053731456133667000174420ustar00rootroot00000000000000from __future__ import annotations import functools import importlib import pkgutil import threading from typing import Any, Callable, Optional, Sequence import tiktoken_ext from tiktoken.core import Encoding _lock = threading.RLock() ENCODINGS: dict[str, Encoding] = {} ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None @functools.lru_cache() def _available_plugin_modules() -> Sequence[str]: # tiktoken_ext is a namespace package # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes # - we use namespace package pattern so `pkgutil.iter_modules` is fast # - it's a separate top-level package because namespace subpackages of non-namespace # packages don't quite do what you want with editable installs mods = [] plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".") for _, mod_name, _ in plugin_mods: mods.append(mod_name) return mods def _find_constructors() -> None: global ENCODING_CONSTRUCTORS with _lock: if ENCODING_CONSTRUCTORS is not None: return ENCODING_CONSTRUCTORS = {} for mod_name in _available_plugin_modules(): mod = importlib.import_module(mod_name) try: constructors = mod.ENCODING_CONSTRUCTORS except AttributeError as e: raise ValueError( f"tiktoken plugin {mod_name} does not define ENCODING_CONSTRUCTORS" ) from e for enc_name, constructor in constructors.items(): if enc_name in ENCODING_CONSTRUCTORS: raise ValueError( f"Duplicate encoding name {enc_name} in tiktoken plugin {mod_name}" ) ENCODING_CONSTRUCTORS[enc_name] = constructor def get_encoding(encoding_name: str) -> Encoding: if encoding_name in ENCODINGS: return ENCODINGS[encoding_name] with _lock: if encoding_name in ENCODINGS: return ENCODINGS[encoding_name] if ENCODING_CONSTRUCTORS is None: _find_constructors() assert ENCODING_CONSTRUCTORS is not None if encoding_name not in ENCODING_CONSTRUCTORS: raise ValueError( f"Unknown encoding {encoding_name}. 
Plugins found: {_available_plugin_modules()}" ) constructor = ENCODING_CONSTRUCTORS[encoding_name] enc = Encoding(**constructor()) ENCODINGS[encoding_name] = enc return enc def list_encoding_names() -> list[str]: with _lock: if ENCODING_CONSTRUCTORS is None: _find_constructors() assert ENCODING_CONSTRUCTORS is not None return list(ENCODING_CONSTRUCTORS) tiktoken-0.6.0/tiktoken_ext/000077500000000000000000000000001456133667000160705ustar00rootroot00000000000000tiktoken-0.6.0/tiktoken_ext/openai_public.py000066400000000000000000000070241456133667000212560ustar00rootroot00000000000000from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe ENDOFTEXT = "<|endoftext|>" FIM_PREFIX = "<|fim_prefix|>" FIM_MIDDLE = "<|fim_middle|>" FIM_SUFFIX = "<|fim_suffix|>" ENDOFPROMPT = "<|endofprompt|>" def gpt2(): mergeable_ranks = data_gym_to_mergeable_bpe_ranks( vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json", vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5", encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783", ) return { "name": "gpt2", "explicit_n_vocab": 50257, # The pattern in the original GPT-2 release is: # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" # This is equivalent, but executes faster: "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", "mergeable_ranks": mergeable_ranks, "special_tokens": {ENDOFTEXT: 50256}, } def r50k_base(): mergeable_ranks = load_tiktoken_bpe( "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930", ) return { "name": "r50k_base", "explicit_n_vocab": 50257, "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", "mergeable_ranks": mergeable_ranks, "special_tokens": {ENDOFTEXT: 50256}, } def p50k_base(): mergeable_ranks = load_tiktoken_bpe( "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069", ) return { "name": "p50k_base", "explicit_n_vocab": 50281, "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", "mergeable_ranks": mergeable_ranks, "special_tokens": {ENDOFTEXT: 50256}, } def p50k_edit(): mergeable_ranks = load_tiktoken_bpe( "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069", ) special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283} return { "name": "p50k_edit", "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", "mergeable_ranks": mergeable_ranks, "special_tokens": special_tokens, } def cl100k_base(): mergeable_ranks = load_tiktoken_bpe( "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7", ) special_tokens = { ENDOFTEXT: 100257, FIM_PREFIX: 100258, FIM_MIDDLE: 100259, FIM_SUFFIX: 100260, ENDOFPROMPT: 100276, } return { "name": "cl100k_base", "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""", "mergeable_ranks": mergeable_ranks, 
"special_tokens": special_tokens, } ENCODING_CONSTRUCTORS = { "gpt2": gpt2, "r50k_base": r50k_base, "p50k_base": p50k_base, "p50k_edit": p50k_edit, "cl100k_base": cl100k_base, }