pax_global_header00006660000000000000000000000064152037305430014514gustar00rootroot0000000000000052 comment=77e07c9c19b89266632b47a4321b33f2106135bb snowball-3.1.0/000077500000000000000000000000001520373054300133365ustar00rootroot00000000000000snowball-3.1.0/.github/000077500000000000000000000000001520373054300146765ustar00rootroot00000000000000snowball-3.1.0/.github/workflows/000077500000000000000000000000001520373054300167335ustar00rootroot00000000000000snowball-3.1.0/.github/workflows/ci.yml000066400000000000000000000234771520373054300200660ustar00rootroot00000000000000name: CI # Use bash by default on all platforms. defaults: run: shell: bash on: push: paths-ignore: - '*.rst' - NEWS pull_request: branches: master paths-ignore: - '*.rst' - NEWS workflow_call: inputs: runtime-tests: # We reuse this workflow from runtime-tests.yml. type: boolean # Allows you to run this workflow manually from the Actions tab workflow_dispatch: jobs: build: strategy: matrix: include: - name: "C distribution build" CFLAGS_DIST_BUILD: '-O2 -Wall -W -std=c90 -pedantic -Wmissing-prototypes -Wmissing-declarations -Wshadow -Werror' - name: "C distribution build (clang)" CFLAGS_DIST_BUILD: '-O2 -Wall -W -std=c90 -pedantic -Wmissing-prototypes -Wmissing-declarations -Wshadow -Werror' CC: clang - name: "C" c_tests: y WERROR: '-std=c99 -pedantic -Werror' - name: "C (clang)" c_tests: y WERROR: '-std=c99 -pedantic -Werror' CC: clang - name: java JAVA: java JAVAC: javac JAVACFLAGS: '-Xlint:all -Werror' - name: dart DART: dart os: 'ubuntu-24.04' dart_sdk: stable - name: go_old os: 'ubuntu-22.04' apt_packages: 'golang-1.13' GO: go - name: go_new os: 'ubuntu-24.04' apt_packages: 'golang-1.22' GO: go - name: javascript_node JSRUN: node apt_packages: 'nodejs' - name: javascript_deno JSRUN: 'deno --allow-read --allow-write' apt_packages: 'nodejs' - name: rust RUST: rust apt_packages: 'rustc' - name: zig ZIG: zig - name: csharp MCS: mcs apt_packages: 'mono-devel' - name: Pascal FPC: fpc apt_packages: 
'fpc' - name: PHP 8.5 PHP: php apt_extra_repo: ppa:ondrej/php apt_packages: 'php8.5-cli php8.5-mbstring' - name: PHP 8.4 PHP: php apt_extra_repo: ppa:ondrej/php apt_packages: 'php8.4-cli php8.4-mbstring' - name: PHP 8.3 PHP: php os: 'ubuntu-24.04' apt_packages: 'php8.3-cli php8.3-mbstring' - name: Python 3.8 PYTHON_VERSION: 3.8 os: 'ubuntu-22.04' # The pure Python versions run slowly - when we used travis for CI # we used to need to thin the testdata for languages such as Arabic # where there's a lot to avoid the build exceeding the maximum time # allowed for a CI job. GHA allows jobs to take up to 6 hours so # we should no longer need to do this. THIN_FACTOR: 1 - name: Python 3.10 PYTHON: python3.10 os: 'ubuntu-22.04' apt_packages: 'python3.10' THIN_FACTOR: 1 - name: Python 3.12 PYTHON: python3.12 os: 'ubuntu-24.04' apt_packages: 'python3.12' THIN_FACTOR: 1 - name: Python (pypy3) PYTHON: pypy3 apt_packages: 'pypy3' - name: Ada gprbuild: gprbuild apt_packages: 'gnat gprbuild' - name: Windows (C) os: windows-latest c_tests: y ccache: sccache - name: Windows (Go) os: windows-latest GO: go MAKE: mingw32-make mingw64_packages: 'mingw-w64-ucrt-x86_64-go' ccache: sccache fail-fast: false runs-on: ${{ matrix.os || 'ubuntu-latest' }} env: CC: ${{ matrix.CC || 'gcc' }} MAKE: ${{ matrix.MAKE || 'make' }} --output-sync --jobs=4 STEMMING_DATA: 'snowball-data' WERROR: ${{ matrix.WERROR }} steps: - name: Checkout uses: actions/checkout@v5 with: show-progress: false - name: Checkout data if: ${{ ! inputs.runtime-tests }} run: | # Try to check out a branch of the same name from the snowball-data # repo sibling of this snowball repo, so that PRs requiring changes to # both can be CI tested easily. # # For a PR, GHA will have merged the PR branch into upstream master so # we need to similarly merge the snowball-data branch into upstream # master of the snowball-data repo as there may be changes there # required by snowball master. 
# # If there's no such branch (or repo) we just use the standard # snowball-data repo's default branch. If there is such a branch but # the merge fails, we treat that as a fatal error. UPSTREAM_REPO_URL=https://github.com/snowballstem/snowball-data.git if [ -n "$GITHUB_HEAD_REF" ] ; then # Pull-request. GH_BRANCH=${GITHUB_HEAD_REF} GH_REPO_OWNER=${GITHUB_ACTOR} GH_REPO_URL=https://github.com/$GH_REPO_OWNER/snowball-data.git git clone "$UPSTREAM_REPO_URL" cd snowball-data git remote add pr "$GH_REPO_URL" git config --global user.email "ci@example.org" git config --global user.name "CI" echo "Trying branch $GH_BRANCH from $GH_REPO_URL" if git fetch pr && git branch --track "$GH_BRANCH" pr/"$GH_BRANCH" ; then git merge "$GH_BRANCH" else echo "Falling back to $UPSTREAM_REPO_URL" fi else # Push. GH_BRANCH=${GITHUB_REF_NAME} GH_REPO_OWNER=${GITHUB_REPOSITORY_OWNER} GH_REPO_URL=https://github.com/$GH_REPO_OWNER/snowball-data.git echo "Trying branch $GH_BRANCH from $GH_REPO_URL" if ! git clone -b "$GH_BRANCH" "$GH_REPO_URL" ; then echo "Falling back to $UPSTREAM_REPO_URL" git clone "$UPSTREAM_REPO_URL" fi fi - name: Hook up runtime testing if: ${{ inputs.runtime-tests }} run: | make setup_runtime_tests - name: Install CCache uses: hendrikmuhs/ccache-action@v1.2.21 with: key: ${{ matrix.name }} variant: ${{ matrix.ccache || 'ccache' }} - name: Enable extra Ubuntu package repo if: matrix.apt_extra_repo run: | sudo apt-get -qq remove "php*-cli" "php*-dev" sudo add-apt-repository -y ${{ matrix.apt_extra_repo }} sudo apt-get update - name: Install Ubuntu packages if: matrix.apt_packages run: | sudo apt-get update sudo apt-get install -y ${{ matrix.apt_packages }} - uses: denoland/setup-deno@v2 if: ${{ startsWith(matrix.JSRUN, 'deno') }} with: deno-version: vx.x.x - name: Install Dart SDK if: matrix.dart_sdk uses: dart-lang/setup-dart@v1.7.2 with: sdk: ${{ matrix.dart_sdk }} - name: Install Zig if: matrix.ZIG uses: mlugg/setup-zig@v2 with: version: 0.16.0 - name: Install mingw64 
packages if: matrix.mingw64_packages uses: msys2/setup-msys2@v2 with: msystem: ucrt64 install: base-devel ${{ matrix.mingw64_packages }} - name: Build run: $MAKE CC="${{ matrix.ccache || 'ccache' }} $CC" - name: Test C dist if: matrix.CFLAGS_DIST_BUILD run: | pip install setuptools build $MAKE dist mkdir tmp cd tmp tar xf ../dist/libstemmer_c-*.tar.gz cd libstemmer_c-* $MAKE CFLAGS="${{ matrix.CFLAGS_DIST_BUILD }}" cd ../.. rm -rf tmp - name: Test C if: matrix.c_tests run: $MAKE check CC="$CC" - uses: actions/setup-python@v6 with: python-version: ${{ matrix.PYTHON_VERSION }} if: matrix.PYTHON_VERSION - name: Test Python if: matrix.PYTHON || matrix.PYTHON_VERSION run: $MAKE check_python python="${{ matrix.PYTHON || 'python' }}" THIN_FACTOR="${{ matrix.THIN_FACTOR }}" - name: Test Java if: matrix.JAVA && matrix.JAVAC run: $MAKE check_java JAVA="${{ matrix.JAVA }}" JAVAC="${{ matrix.JAVAC }}" JAVACFLAGS="${{ matrix.JAVACFLAGS }}" - name: Test Dart if: matrix.DART run: $MAKE check_dart DART="${{ matrix.DART }}" - name: Test C# if: matrix.MCS run: $MAKE check_csharp MCS="${{ matrix.MCS }}" - name: Test Javascript if: matrix.JSRUN run: $MAKE check_js JSRUN="${{ matrix.JSRUN }}" - name: Lint and check Javascript code if: ${{ startsWith(matrix.JSRUN, 'deno') }} run: | deno lint javascript/*.js js_out/*.js deno check javascript/*.js js_out/*.js - name: Test Rust if: matrix.RUST run: $MAKE check_rust RUST="${{ matrix.RUST }}" - name: Test Go if: matrix.GO run: | go mod init github.com/snowballstem/snowball $MAKE check_go GO="${{ matrix.GO }}" rm -f go.mod - name: Test Pascal if: matrix.FPC run: $MAKE check_pascal FPC="${{ matrix.FPC }}" - name: Test PHP if: matrix.PHP run: | ${{ matrix.PHP }} --version $MAKE check_php PHP="${{ matrix.PHP }}" - name: Test Zig if: matrix.ZIG run: | $MAKE check_zig zig="${{ matrix.ZIG }}" rm -rf "$ZIG_LOCAL_CACHE_DIR" - name: Test Ada if: matrix.gprbuild run: $MAKE check_ada gprbuild="${{ matrix.gprbuild }}" - name: make clean run: | rm -rf 
'${{ env.STEMMING_DATA }}' # `.ccache`/`.sccache` is specific to CI. We can't just delete it here # or else ccache-action won't save the cache. git status --porcelain=v1|grep -v '^?? \.s\?ccache/$'|grep '^' && { echo 'The generated files listed above are not in .gitignore' ; exit 1; }; true $MAKE clean ${{ case(inputs.runtime-tests == true, 'clean_runtime_tests', '') }} git status --porcelain=v1 --ignored|grep -v '^?? \.s\?ccache/$'|grep '^' && { echo 'The generated files listed above were not removed by `make clean`' ; exit 1; }; true snowball-3.1.0/.github/workflows/runtime-tests.yml000066400000000000000000000006371520373054300223070ustar00rootroot00000000000000name: Runtime tests # Use bash by default on all platforms. defaults: run: shell: bash on: push: paths-ignore: - '*.rst' - NEWS pull_request: branches: master paths-ignore: - '*.rst' - NEWS # Allows you to run this workflow manually from the Actions tab workflow_dispatch: jobs: build: uses: ./.github/workflows/ci.yml with: runtime-tests: true snowball-3.1.0/.gitignore000066400000000000000000000014621520373054300153310ustar00rootroot00000000000000.DS_Store *.exe *.o # Created by `setup_runtime_tests`: /overrides.mk /tmp_runtime_tests_snowball_data/ # /algorithms.mk /snowball /stemtest /dist # Ada /ada/algorithms/ /ada/bin/ /ada/obj/ # C /libstemmer/libstemmer.c /libstemmer/libstemmer_utf8.c /libstemmer/mkinc.mak /libstemmer/mkinc_utf8.mak /libstemmer/modules.h /libstemmer/modules_utf8.h /libstemmer.a /src_c /stemwords # C# /csharp_stemwords /csharp/Snowball/Algorithms/*.generated.cs # Dart /dart/lib/ext/ /dart/lib/src/algorithms.dart # Go /go/algorithms/ /go/stemwords/algorithms.go # Java /java/org/tartarus/snowball/ext/ /java/org/tartarus/snowball/*.class # JS /js_out # PHP /php_out # Python /python_check /python_out # Rust /rust/Cargo.lock /rust/src/snowball/algorithms/*.rs /rust/target/ # Zig /zig/*_stemmer.zig /zig/algorithms.zig /zig/stemwords 
snowball-3.1.0/AUTHORS000066400000000000000000000013171520373054300144100ustar00rootroot00000000000000Authors ======= Martin Porter ------------- - Designed the snowball language. - Implemented the snowball to C compiler. - Implemented the stemming algorithms in C. - Wrote the documentation. Richard Boulton --------------- - Implemented Java backend of the snowball compiler. - Developed build system. - Assisted with website maintenance. Assistance from --------------- Olivier Bornet - fixes to java packaging and build system. Andreas Jung - useful bug reports on the libstemmer library. Olly Betts - several patches, bug reports, and performance improvements. Sebastiano Vigna and Oerd Cukalla - patches for the Java stemming algorithms. Ralf Junker - fix a potential memory leak in sb_stemmer_new(). snowball-3.1.0/CONTRIBUTING.rst000066400000000000000000000300571520373054300160040ustar00rootroot00000000000000General contribution guidelines =============================== We don't have a formally defined coding style guide, but please strive to make new/changed code look like the code around it. Use spaces-only for indentation except where there's a syntax reason (e.g. ``GNUmakefile``) or a strong convention (e.g. Go's standard seems to be tabs, and ``gofmt`` reindents code using tabs). Avoid adding trailing whitespace on lines. Make sure there's a newline character at the end of new text files. Avoid mixing code reformatting changes with functional changes - doing so makes it harder to review patches. Adding a new stemming algorithm =============================== To add a new stemming algorithm you need to submit PRs against three repositories. See below for details of what's needed in each of these. Name the branch the same for at least `snowball` and `snowball-data` and push to `snowball-data` first, then the CI should use your new vocabulary list when running the testsuite. snowball repo ------------- This is where the implementation of the new algorithm goes. 
Add the `.sbl` source implementing it to the `algorithms/` subdirectory. Add entry to `libstemmer/modules.txt`, maintaining the current sorted order by the first column. The columns are: * Algorithm name (needs to match the `.sbl` source without extension) * Encodings to support. Wide-character Unicode is always supported and doesn't need to be listed here. You should always include `UTF_8`, and also any of `ISO_8859_1`, `ISO_8859_2` and `KOI8_R` which the language can usefully be written using only characters from (in particular they need to contain all the characters the stemmer explicitly uses). Support for other single-byte character sets is easy to add if they're useful. * Names and ISO-639 codes for the language. Wikipedia has a handy list of `all the ISO-639 codes `_ - find the row for your new language and include the codes from the "639-1", "639-2/T" and (if different) "639-2/B" columns. For example, for the `Afar` language you'd put `afar,aa,aar` here. Some points to note about algorithm implementations: * Avoid literal non-ASCII characters in snowball string literals - they will work OK for languages that use UTF-8, but not wide-character Unicode or other encodings. Instead use ``stringdef`` like the existing stemmers do, and please use the newer `U+` notation rather than the older ``hex`` or ``decimal`` as this allows us to support different encodings without having to modify the source files - for example:: stringdef o" {U+00F6} define foo 'o{o"}' not:: stringdef o" hex F6 define foo 'o{o"}' and definitely not:: define foo 'oö' It's OK to use UTF-8 in comments. * It's helpful to consistently use the same ``stringdef`` codes across the different stemmers - for languages using the latin alphabet our website has `guidance on what to use `_ and a `list of stringdef lines for common characters to cut and paste from `_. 
snowball-data repo ------------------ Add subdirectory named after new stemmer containing: * voc.txt - word list * output.txt - stemmed equivalents * COPYING - licensing details (word lists need to be under an OSI-approved licence) If you don't have access to a suitably licensed word list of a suitable size, you may be able to use the `wikipedia-most-common-words` script to generate one by extracting the most frequent words from a Wikipedia dump in the language the stemmer is for. You need to specify the Unicode "script" (that's "script" in the sense of alphabet) to use - you can find the appropriate one by looking in the Unicode `Scripts.txt `_. The script name is the second column, between `;` and `#`. The first entries are all "Common" which isn't what you want - scroll down to get to the entries that are useful here. You also need to specify the minimum frequency to select. Picking this value will probably need some experimentation as the appropriate threshold depends on how much data there is in the wikipedia dump for a particular language, as well as the size of the vocabulary for the language, and how inflected the language is. Try counting the number of unique words extracted (`wc -l voc.txt` on Unix) and also looking through the list - some proper nouns, words from other languages, typos, etc are OK (since the stemmer will encounter all these in practice too), but at some point "more" stops being "better". snowball-website repo --------------------- This is where a description of the new algorithm goes. Experience from maintaining Snowball for many years has shown us that the most important points to cover are **WHY** particular things are done or are not done. For example, if a particular ending isn't removed because doing so causes problems in other cases it's really helpful to have that recorded. 
Then if years later we get a bug report because this ending isn't removed we can easily answer, and don't have to try to contact you and hope you can remember, or try to work out why for ourselves. The original set of Snowball stemmers each have an English prose description of the algorithm which focuses on **WHAT** the algorithm does. These might be helpful if you want to implement the algorithm from scratch in a separate language, but they've not proved very useful for maintaining the Snowball implementations - if the prose and Snowball code disagree we know something is wrong, but it's hard to know which is right! Therefore we recommend to let the Snowball implementation describe what the algorithm does, and only comment on "**WHAT**" in cases where the implementation needs explanation to help the reader understand it. If your algorithm is based on an academic paper, cite the paper and describe any differences between your implementation and that described in the paper. For example, sometimes papers have ambiguities that need resolving to re-implement the algorithm described - see the `Hindi `_ and `Indonesian `_ stemming algorithms descriptions for examples. The mechanics of adding the algorithm description are: * Create subdirectory of `algorithms/` named after the language. * Create `stemmer.tt` which describes the stemming algorithm. This is a "template toolkit" template which is essentially a mix of HTML and some macros for adding the navigation, sample vocabulary, etc. See the existing `stemmer.tt` files for other algorithms for how to use these macros. * If you have a stopword list, add that as `stop.txt` in your new subdirectory. The `generate` script checks if such a file exists and if it does a link to it is automatically added. * Link to your new `stemmer.tt` from `algorithms/index.tt`. * Add a news entry to `index.tt`. * Add the new stemmer to the online demo. 
Assuming you have checkouts of the `snowball`, `snowball-data` and `snowball-website` repos in sibling directories: * run `make check_js` in the `snowball` repo * run `./update-js` * add the new stemmer to git with: `git add js/*-stemmer.js` * if the new language is written right-to-left (RTL) then add it to the check in `demo.tt` (search for `rtl` to find the place to change.) * `git commit`. Adding a new programming language generator =========================================== This is a short guide to adding support for generating code for another programming language. Is a new generator the right solution? -------------------------------------- Adding a new code generator is probably not your only option if you want to use Snowball from another language - most languages have support for writing bindings to a C library, so this is probably another option. Generating code can have advantages. For example, it can be simpler to deploy without C bindings which need to be built for a specific platform. However, it's likely to be significantly more work to implement a new generator than to write bindings to the generated C code, especially as the libstemmer C API is a very small and simple one. Generated code can also be slower - currently the Snowball compiler often generates code that assumes an optimising compiler will clean up redundant constructs, which is not a problem for C, and probably not for most compiled languages, but for a language like Python C bindings are much faster than the generated Python code (using pypy helps a lot, but is still slower). See doc/libstemmer_python_README for some timings. That said, the unoptimised generated code has improved over time, and is likely to improve further in the future. Key problems to solve --------------------- * You need to work out how to map the required flow of control in response to Snowball signals. In the generated C code this is mostly done using `goto`. 
If your language doesn't provide an equivalent to `goto` then you'll need an alternative solution. In Java and JavaScript we use labelled `break` from blocks and loops instead. If your language has an equivalent to this feature, that will probably work. For Python, we currently generate a `try:` ... `raise lab123` ... `except lab123: pass` construct. This works, but doesn't seem ideal. If one of the mechanisms above sounds suitable then take a look at the generator for the respective generated output and generator code. If not, come and talk to us on the snowball-discuss mailing list. * Snowball's division is specified as integer division with semantics matching C - i.e. the result should be truncated (rounded towards zero). Some languages lack a built-in integer division operation, or have one which instead implements rounding towards negative infinity. Existing backends with special handling here which may be useful to look at include Javascript, Pascal and Python. * Although all the stemmers we currently ship define a single Snowball `external` named `stem`, this should not be assumed. Also externals should be callable from Snowball code as well as externally (see the Go and Rust generators for examples of how to handle this - if an external is called from Snowball code then it is generated as a routine, with a shim external which just tail calls to this routine). Don't hardcode algorithm names ------------------------------ We want to avoid hard-coded lists of algorithms in the language-specific code that have to be manually updated each time a new algorithm is added, because that adds some extra tedious work for adding a new algorithm, and mechanical updates done by hand tend to miss places that need updating, or code gets copied and pasted from an existing case but not fully updated. All the existing language backends generate any such code at build time, and adding a new algorithm just requires updating `libstemmer/modules.txt`. 
You can probably copy the approach used for Pascal (script `pascal/generate.pl` works from template `stemwords-template.dpr` which has marked blocks of code that get expanded for each stemming algorithm with a placeholder replaced by the algorithm name. For an alternative approach, see Rust where this is done by `rust/build.rs`. Mechanics of adding a new generator ----------------------------------- Copy an existing `compiler/generator_*.c` for your new language and modify away (`generator.c` has the generator for C, but also some common functions so if you start from this one you'll need to remove those common functions). Please resist reformatting existing C code - there's currently a lot of code repeated in each generator which ought to be pulled out as common code, and if you reformat that just makes that job harder. Add your new source to `COMPILER_SOURCES` in `GNUmakefile`. Add prototypes for the new functions to `compiler/header.h`. Add support to `compiler/driver.c`. Add targets to `GNUmakefile` to run tests for the new language. Hook up automated testing via CI in `.github/workflows/ci.yml`. Add to the list of languages in `README.rst`. snowball-3.1.0/COPYING000066400000000000000000000031361520373054300143740ustar00rootroot00000000000000Copyright (c) 2001, Dr Martin Porter Copyright (c) 2004,2005, Richard Boulton Copyright (c) 2013, Yoshiki Shibukawa Copyright (c) 2006-2025, Olly Betts All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the Snowball project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. snowball-3.1.0/GNUmakefile000066400000000000000000001144731520373054300154220ustar00rootroot00000000000000# -*- makefile -*- # After changing this, run `make update_version` to update various sources # which hard-code it. SNOWBALL_VERSION = 3.1.0 ifeq ($(OS),Windows_NT) EXEEXT = .exe endif # `make SAVETMP=1` to save stemwords output for UTF-8 C stemmers on failure. # Intended for use with snowball-data's stemmer-compare. ifneq '$(SAVETMP)' '' .NOTPARALLEL: TEE_TO_TMP_TXT:=tee tmp.txt| CLEAN_TMP_TXT:=rm -f tmp.txt endif # Use to hook up runtime tests (see `setup_runtime_tests` target below). -include overrides.mk # `make SNOWBALL_FLAGS=-comments` to generate target language code with # comments indicating the corresponding lines in the .sbl source. 
SNOWBALL_FLAGS ?= SNOWBALL_COMPILE := ./snowball $(SNOWBALL_FLAGS) # Ada gprbuild ?= gprbuild ada_src_main_dir = ada ada_src_dir = $(ada_src_main_dir)/algorithms # C ARFLAGS = -cr c_src_dir = src_c # C# MONO ?= mono MCS ?= mcs csharp_src_main_dir = csharp/Snowball csharp_src_dir = $(csharp_src_main_dir)/Algorithms csharp_sample_dir = csharp/Stemwords # Dart DART ?= dart DART_RUN_FLAGS ?= --enable-asserts dart_src_main_dir = dart/lib dart_src_dir = $(dart_src_main_dir)/ext dart_runtime_dir = dart/lib/src dart_gen_dir = dart/lib/gen dart_example_dir = dart/example dart_package_dir = dart # Go go ?= go goflags ?= stemwords/algorithms.go stemwords/main.go gofmt ?= gofmt go_src_main_dir = go go_src_dir = $(go_src_main_dir)/algorithms # Java JAVACFLAGS ?= JAVAC ?= javac JAVA ?= java -ea java_src_main_dir = java/org/tartarus/snowball java_src_dir = $(java_src_main_dir)/ext # Javascript js_output_dir = js_out js_runtime_dir = javascript js_sample_dir = sample JSRUN ?= node JSTYPE ?= global # Pascal FPC ?= fpc # Enable warnings, info, notes; select "FILE:LINE:" diagnostic format. FPC_FLAGS ?= -veiwnr pascal_src_dir = pascal # PHP php_output_dir = php_out php_runtime_dir = php PHP ?= php # Python python ?= python3 python_output_dir = python_out python_runtime_dir = snowballstemmer python_sample_dir = sample # Rust cargo ?= cargo cargoflags ?= --release rust_src_main_dir = rust/src rust_src_dir = $(rust_src_main_dir)/snowball/algorithms # Zig zig ?= zig zig_src_dir = zig DIFF = diff ifeq ($(OS),Windows_NT) DIFF = diff --strip-trailing-cr endif # If iconv isn't installed you can use iconv.py instead via: # # make check ICONV='python iconv.py' ICONV ?= iconv # Where the data files are located - assumes their repo is checked out as # a sibling to this one. STEMMING_DATA ?= ../snowball-data STEMMING_DATA_ABS := $(abspath $(STEMMING_DATA)) # Keep one in $(THIN_FACTOR) entries from gzipped vocabularies. 
THIN_FACTOR ?= 3 ifneq (1,$(THIN_FACTOR)) ifneq (,$(THIN_FACTOR)) # Command to thin out the testdata. Used for Python tests, which otherwise # take a long time (unless you use pypy). THIN_TEST_DATA := |awk '(FNR % $(THIN_FACTOR) == 0){print}' endif endif tarball_ext = .tar.gz ALGORITHMS ?= algorithms MODULES ?= libstemmer/modules.txt # algorithms.mk is generated from the file $(MODULES) and defines: # * libstemmer_algorithms # * ISO_8859_1_algorithms # * ISO_8859_2_algorithms # * KOI8_R_algorithms include algorithms.mk other_algorithms ?= lovins all_algorithms = $(libstemmer_algorithms) $(other_algorithms) COMPILER_SOURCES = compiler/analyser.c \ compiler/driver.c \ compiler/generator.c \ compiler/generator_ada.c \ compiler/generator_c.c \ compiler/generator_csharp.c \ compiler/generator_dart.c \ compiler/generator_go.c \ compiler/generator_java.c \ compiler/generator_js.c \ compiler/generator_pascal.c \ compiler/generator_php.c \ compiler/generator_python.c \ compiler/generator_rust.c \ compiler/generator_zig.c \ compiler/space.c \ compiler/tokeniser.c COMPILER_HEADERS = compiler/header.h \ compiler/tokens.h # C RUNTIME_SOURCES = runtime/api.c \ runtime/utilities.c RUNTIME_HEADERS = runtime/api.h \ runtime/snowball_runtime.h LIBSTEMMER_SOURCES = libstemmer/libstemmer.c LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h LIBSTEMMER_EXTRA = $(MODULES) libstemmer/libstemmer_c.in STEMWORDS_SOURCES = examples/stemwords.c STEMTEST_SOURCES = tests/stemtest.c # C# CSHARP_RUNTIME_SOURCES = csharp/Snowball/Among.cs \ csharp/Snowball/Stemmer.cs \ csharp/Snowball/AssemblyInfo.cs CSHARP_STEMWORDS_SOURCES = csharp/Stemwords/Program.cs # Dart DART_RUNTIME_SOURCES = dart/lib/src/snowball.dart \ dart/lib/src/algorithms.dart DART_PACKAGE_SOURCES = dart/lib/snowball.dart DART_TEST_SOURCES = dart/example/test_app.dart DART_PACKAGE_FILES = dart/pubspec.yaml \ dart/analysis_options.yaml \ 
dart/.gitignore # Java JAVA_RUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \ java/org/tartarus/snowball/CharArraySequence.java \ java/org/tartarus/snowball/SnowballProgram.java \ java/org/tartarus/snowball/SnowballStemmer.java \ java/org/tartarus/snowball/TestApp.java # Javascript JS_RUNTIME_SOURCES = javascript/base-stemmer.js JS_SAMPLE_SOURCES = javascript/stemwords.js # Pascal PASCAL_RUNTIME_SOURCES = pascal/SnowballProgram.pas PASCAL_STEMWORDS_SOURCES = pascal/stemwords.dpr # PHP PHP_RUNTIME_SOURCES = php/base-stemmer.php # Python PYTHON_RUNTIME_SOURCES = python/snowballstemmer/basestemmer.py \ python/snowballstemmer/among.py PYTHON_SAMPLE_SOURCES = python/testapp.py \ python/stemwords.py PYTHON_PACKAGE_FILES = python/MANIFEST.in \ python/pyproject.toml \ python/setup.py \ python/setup.cfg PYTHON_STEMWORDS_SOURCE = python/stemwords.py COMMON_FILES = COPYING \ NEWS ALL_ALGORITHM_FILES = $(all_algorithms:%=$(ALGORITHMS)/%.sbl) C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c) C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h) C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java) CSHARP_SOURCES = $(libstemmer_algorithms:%=$(csharp_src_dir)/%Stemmer.generated.cs) DART_SOURCES = $(libstemmer_algorithms:%=$(dart_src_dir)/%_stemmer.dart) \ $(dart_runtime_dir)/algorithms.dart PASCAL_SOURCES = $(ISO_8859_1_algorithms:%=$(pascal_src_dir)/%Stemmer.pas) PYTHON_SOURCES = 
$(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py) \ $(python_output_dir)/__init__.py JS_SOURCES = $(libstemmer_algorithms:%=$(js_output_dir)/%-stemmer.js) \ $(js_output_dir)/base-stemmer.js PHP_SOURCES = $(libstemmer_algorithms:%=$(php_output_dir)/%-stemmer.php) \ $(php_output_dir)/base-stemmer.php RUST_SOURCES = $(libstemmer_algorithms:%=$(rust_src_dir)/%_stemmer.rs) ZIG_SOURCES = $(libstemmer_algorithms:%=$(zig_src_dir)/%_stemmer.zig) \ $(zig_src_dir)/algorithms.zig GO_SOURCES = $(libstemmer_algorithms:%=$(go_src_dir)/%_stemmer.go) \ $(go_src_main_dir)/stemwords/algorithms.go ADA_SOURCES = $(libstemmer_algorithms:%=$(ada_src_dir)/stemmer-s_%.ads) \ $(libstemmer_algorithms:%=$(ada_src_dir)/stemmer-s_%.adb) \ $(ada_src_dir)/stemmer-factory.ads $(ada_src_dir)/stemmer-factory.adb COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o) RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o) LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o) LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o) STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o) STEMTEST_OBJECTS=$(STEMTEST_SOURCES:.c=.o) C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o) C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o) JAVA_CLASSES = $(JAVA_SOURCES:.java=.class) JAVA_RUNTIME_CLASSES=$(JAVA_RUNTIME_SOURCES:.java=.class) CFLAGS=-g -O2 -W -Wall -Wcast-qual -Wmissing-prototypes -Wmissing-declarations -Wshadow $(WERROR) CPPFLAGS= INCLUDES=-Iinclude all: snowball$(EXEEXT) libstemmer.a stemwords$(EXEEXT) $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) algorithms.mk: GNUmakefile libstemmer/mkalgorithms.pl $(MODULES) libstemmer/mkalgorithms.pl algorithms.mk $(MODULES) clean: rm -f $(CLEANFILES) rm -rf $(CLEANDIRS) update_version: perl -pi -e '/SNOWBALL_VERSION/ && s/\d+\.\d+\.\d+/$(SNOWBALL_VERSION)/' \ compiler/header.h \ csharp/Snowball/AssemblyInfo.cs \ dart/pubspec.yaml \ python/setup.py perl -pi -e 's/(libstemmer_c-)\d+\.\d+.\d+/$${1}$(SNOWBALL_VERSION)/' README.rst # Generate and build for all target languages. 
everything: ada all csharp dart go java js pascal python rust zig

# Generate code for all languages. Override build tools to do as little code
# building as possible.
#
# The overrides below are target-specific variables, so they apply to
# `generate` and to everything built as a prerequisite of it.  `:` is the
# shell no-op, so a tool overridden to `:` is effectively skipped.  The
# gprbuild replacement only runs the real gprbuild when its first argument is
# "-Pgenerate".
# NOTE(review): the C# link rule in this file invokes $(MCS) (upper case);
# confirm that overriding lower-case `mcs` here actually reaches it.
generate: gprbuild=perl -e '$$ARGV[0] eq "-Pgenerate" and unshift @ARGV, "gprbuild" and exec @ARGV' --
generate: mcs=:
generate: DART=:
generate: go=:
generate: JAVAC=:
generate: everything

# The directories where generated code goes for all languages.
ALL_CODE_DIRS := \
	ada src_c csharp dart go java js_out pascal python_out rust zig

# When runtime tests are enabled, this gets overridden by overrides.mk.
BASELINE ?= baseline

# Snapshot the generated code: copy each directory in ALL_CODE_DIRS to
# <dir>.$(BASELINE), then strip build products from the copies.
baseline-create: generate
	rm -rf *.$(BASELINE)
	for d in $(ALL_CODE_DIRS) ; do cp -a $$d $$d.$(BASELINE) ; done
	rm -rf *.$(BASELINE)/*.o ada.$(BASELINE)/obj pascal.$(BASELINE)/*.ppu
	find java.$(BASELINE) -name '*.class' -delete

# Show how the current generated code differs from the snapshot, ignoring
# build products.
baseline-diff:
	@for d in $(ALL_CODE_DIRS) ; do diff -ru -x'*.o' -x'obj' -x'*.ppu' -x'*.class' -x'Cargo.lock' -x'target' $$d.$(BASELINE) $$d ; done

.PHONY: all clean update_version everything generate baseline-create baseline-diff

# Test data cannot be built; if a needed file is absent, fail with a hint
# about where to get it.
$(STEMMING_DATA)/% $(STEMMING_DATA_ABS)/%:
	@[ -f '$@' ] || { echo '$@: Test data not found'; echo 'Checkout the snowball-data repo as "$(STEMMING_DATA_ABS)"'; exit 1; }

# Link the Snowball compiler.
snowball$(EXEEXT): $(COMPILER_OBJECTS)
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^

$(COMPILER_OBJECTS): $(COMPILER_HEADERS)

# List of files/glob patterns to remove on clean. This gets appended to by
# each target language section.
CLEANFILES := $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \
	$(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball$(EXEEXT) \
	libstemmer.a stemwords$(EXEEXT) \
	libstemmer/modules.h \
	libstemmer/modules_utf8.h \
	stemtest$(EXEEXT) $(STEMTEST_OBJECTS) \
	libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
	libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c \
	algorithms.mk

# List of directories to recursively remove on clean. This gets appended to by
# each target language section.
CLEANDIRS := dist

# Ada
#
# One compiler run produces both the .adb and the .ads.  The prerequisite is
# snowball$(EXEEXT), matching the compiler's link rule and every other
# generator rule in this file.

ifneq '$(filter grouped-target,$(.FEATURES))' ''
# Grouped-targets were added in GNU make 4.3.
$(ada_src_dir)/stemmer-s_%.adb $(ada_src_dir)/stemmer-s_%.ads &: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
else
# This will fail to recreate the .ads if it is deleted but the corresponding
# .adb is still present and up-to-date. That seems better than forcing a
# serial build with .NOTPARALLEL which it seems can only be applied to an
# entire makefile, not per-rule.
$(ada_src_dir)/stemmer-s_%.ads: $(ada_src_dir)/stemmer-s_%.adb
	@:
$(ada_src_dir)/stemmer-s_%.adb: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
endif
	@mkdir -p $(ada_src_dir)
	$(SNOWBALL_COMPILE) $< -ada -P 'S_$*' -o $@

# C

# libstemmer.c and libstemmer_utf8.c are the same template with a different
# modules header substituted in.
libstemmer/libstemmer.c: libstemmer/libstemmer_c.in
	sed 's/@MODULES_H@/modules.h/' $^ >$@

libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in
	sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@

# NOTE(review): in the next two rules both targets share one recipe and $@ is
# passed as the first argument, so that argument depends on which of the two
# targets make is building - confirm mkmodules.pl copes with either.
libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl $(MODULES)
	libstemmer/mkmodules.pl $@ $(c_src_dir) $(MODULES) libstemmer/mkinc.mak

libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl $(MODULES)
	libstemmer/mkmodules.pl $@ $(c_src_dir) $(MODULES) libstemmer/mkinc_utf8.mak utf8

libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS)

libstemmer.a: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
	$(AR) $(ARFLAGS) $@ $^

examples/%.o: examples/%.c
	$(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $<

stemwords$(EXEEXT): $(STEMWORDS_OBJECTS) libstemmer.a
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^

tests/%.o: tests/%.c
	$(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $<

stemtest$(EXEEXT): $(STEMTEST_OBJECTS) libstemmer.a
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^

# Generated C stemmers, one rule per character encoding.  The KOI8-R and
# ISO-8859-2 variants are compiled together with a charset definition file.
$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(c_src_dir)
	$(SNOWBALL_COMPILE) $< -o $@ -eprefix $*_UTF_8_ -r ../runtime -u

$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(c_src_dir)
	$(SNOWBALL_COMPILE) charsets/KOI8-R.sbl $< -o $@ -eprefix $*_KOI8_R_ -r ../runtime

$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(c_src_dir)
	$(SNOWBALL_COMPILE) $< -o $@ -eprefix $*_ISO_8859_1_ -r ../runtime

$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(c_src_dir)
	$(SNOWBALL_COMPILE) charsets/ISO-8859-2.sbl $< -o $@ -eprefix $*_ISO_8859_2_ -r ../runtime

$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
	$(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $<

# C#

csharp_stemwords$(EXEEXT): $(CSHARP_STEMWORDS_SOURCES) $(CSHARP_RUNTIME_SOURCES) $(CSHARP_SOURCES)
	$(MCS) -unsafe -target:exe -out:$@ $(CSHARP_STEMWORDS_SOURCES) $(CSHARP_RUNTIME_SOURCES) $(CSHARP_SOURCES)

$(csharp_src_dir)/%Stemmer.generated.cs: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(csharp_src_dir)
	$(SNOWBALL_COMPILE) $< -csharp -o $@

# Dart

dart/lib/src/algorithms.dart: dart/generate_algorithms.pl libstemmer/modules.txt
	dart/generate_algorithms.pl $(libstemmer_algorithms) > $@

$(dart_src_dir)/%_stemmer.dart: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(dart_src_dir)
	$(SNOWBALL_COMPILE) $< -dart -o $@ -p SnowballStemmer

# Go

$(go_src_main_dir)/stemwords/algorithms.go: go/stemwords/generate.go $(MODULES)
	@echo "Generating algorithms.go"
	@cd go/stemwords && $(go) generate

$(go_src_dir)/%_stemmer.go: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(go_src_dir)/$*
	$(SNOWBALL_COMPILE) $< -go -o "$(go_src_dir)/$*/$*_stemmer" -P $*
	$(gofmt) -s -w $(go_src_dir)/$*/$*_stemmer.go

# Java

$(java_src_dir)/%Stemmer.java: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(java_src_dir)
	$(SNOWBALL_COMPILE) $< -java -o $@ -p org.tartarus.snowball.SnowballStemmer

# Javascript

$(js_output_dir)/%-stemmer.js: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(js_output_dir)
	$(SNOWBALL_COMPILE) $< -js -o $@

$(js_output_dir)/base-stemmer.js: $(js_runtime_dir)/base-stemmer.js
	@mkdir -p $(js_output_dir)
	cp $< $@

# Pascal

pascal/stemwords.dpr: pascal/stemwords-template.dpr $(MODULES)
	pascal/generate.pl $(ISO_8859_1_algorithms) < pascal/stemwords-template.dpr > $@

pascal/stemwords: $(PASCAL_STEMWORDS_SOURCES) $(PASCAL_RUNTIME_SOURCES) $(PASCAL_SOURCES)
	$(FPC) $(FPC_FLAGS) -o$@ -Mdelphi $(PASCAL_STEMWORDS_SOURCES)

$(pascal_src_dir)/%Stemmer.pas: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(pascal_src_dir)
	$(SNOWBALL_COMPILE) $< -pascal -o $@

# PHP

$(php_output_dir)/%-stemmer.php: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(php_output_dir)
	$(SNOWBALL_COMPILE) $< -php -o $@

$(php_output_dir)/base-stemmer.php: $(php_runtime_dir)/base-stemmer.php
	@mkdir -p $(php_output_dir)
	cp $< $@

# Python

$(python_output_dir)/%_stemmer.py: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(python_output_dir)
	$(SNOWBALL_COMPILE) $< -python -eprefix _ -o $@

$(python_output_dir)/__init__.py: python/create_init.py $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py)
	$(python) python/create_init.py $(python_output_dir)

# Rust

$(rust_src_dir)/%_stemmer.rs: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(rust_src_dir)
	$(SNOWBALL_COMPILE) $< -rust -o $@

# Zig

$(zig_src_dir)/%_stemmer.zig: $(ALGORITHMS)/%.sbl snowball$(EXEEXT)
	@mkdir -p $(zig_src_dir)
	$(SNOWBALL_COMPILE) $< -zig -o $@

$(zig_src_dir)/algorithms.zig: zig/generate_algorithms.pl libstemmer/modules.txt
	zig/generate_algorithms.pl $(libstemmer_algorithms) > $@

.PHONY: dist dist_snowball dist_libstemmer_c dist_libstemmer_csharp dist_libstemmer_dart dist_libstemmer_java dist_libstemmer_js dist_libstemmer_python dist_libstemmer_php

# Make a full source distribution
dist: dist_snowball dist_libstemmer_c dist_libstemmer_csharp dist_libstemmer_dart dist_libstemmer_java dist_libstemmer_js dist_libstemmer_python dist_libstemmer_php

# Make a distribution of all the sources involved in snowball
#
# Each prerequisite is copied into dist/snowball-<version>/ under its original
# relative path, the tree is tarred up, and the staging directory is removed.
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
	    $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
	    $(LIBSTEMMER_SOURCES) \
	    $(LIBSTEMMER_UTF8_SOURCES) \
	    $(LIBSTEMMER_HEADERS) \
	    $(LIBSTEMMER_EXTRA) \
	    $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) $(STEMTEST_SOURCES) \
	    $(COMMON_FILES) \
	    GNUmakefile README.rst doc/TODO libstemmer/mkmodules.pl
	destname=snowball-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	for file in $^; do \
	  dir=`dirname $$file` && \
	  mkdir -p $${dest}/$${dir} && \
	  cp -a $${file} $${dest}/$${dir} || exit 1 ; \
	done && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the C library.
#
# Besides copying sources, this writes a small stand-alone Makefile into the
# tarball.  In the echo'd text `$$` yields a literal `$` in that Makefile, so
# `$$(X)` is expanded when the tarball is built whereas `$(X)` is expanded
# now.  ARFLAGS is written as `$$(ARFLAGS)` so that the `ARFLAGS=-cr` line
# emitted just before it takes effect (and stays overridable).  The echo'd
# recipe lines begin with a literal tab inside the quotes.
# The MANIFEST lists `README`, which is the name the README is copied to.
dist_libstemmer_c: \
	    $(RUNTIME_SOURCES) \
	    $(RUNTIME_HEADERS) \
	    $(LIBSTEMMER_SOURCES) \
	    $(LIBSTEMMER_UTF8_SOURCES) \
	    $(LIBSTEMMER_HEADERS) \
	    $(LIBSTEMMER_EXTRA) \
	    $(C_LIB_SOURCES) \
	    $(C_LIB_HEADERS) \
	    $(COMMON_FILES) \
	    libstemmer/mkinc.mak \
	    libstemmer/mkinc_utf8.mak
	destname=libstemmer_c-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_c_README $${dest}/README && \
	mkdir -p $${dest}/examples && \
	cp -a examples/stemwords.c $${dest}/examples && \
	mkdir -p $${dest}/$(c_src_dir) && \
	cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
	mkdir -p $${dest}/runtime && \
	cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
	mkdir -p $${dest}/libstemmer && \
	cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
	mkdir -p $${dest}/include && \
	mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
	(cd $${dest} && \
	 echo "README" >> MANIFEST && \
	 ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
	 ls runtime/*.c runtime/*.h >> MANIFEST && \
	 ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
	 ls include/*.h >> MANIFEST) && \
	cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
	cp -a $(COMMON_FILES) $${dest} && \
	echo 'include mkinc.mak' >> $${dest}/Makefile && \
	echo 'ifeq ($$(OS),Windows_NT)' >> $${dest}/Makefile && \
	echo 'EXEEXT=.exe' >> $${dest}/Makefile && \
	echo 'endif' >> $${dest}/Makefile && \
	echo 'CFLAGS=-O2' >> $${dest}/Makefile && \
	echo 'CPPFLAGS=-Iinclude' >> $${dest}/Makefile && \
	echo 'ARFLAGS=-cr' >> $${dest}/Makefile && \
	echo 'all: libstemmer.a stemwords$$(EXEEXT)' >> $${dest}/Makefile && \
	echo 'libstemmer.a: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
	echo '	$$(AR) $$(ARFLAGS) $$@ $$^' >> $${dest}/Makefile && \
	echo 'stemwords$$(EXEEXT): examples/stemwords.o libstemmer.a' >> $${dest}/Makefile && \
	echo '	$$(CC) $$(CFLAGS) -o $$@ $$^' >> $${dest}/Makefile && \
	echo 'clean:' >> $${dest}/Makefile && \
	echo '	rm -f stemwords$$(EXEEXT) libstemmer.a *.o $(c_src_dir)/*.o examples/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the C# library.
dist_libstemmer_csharp: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
	    $(COMMON_FILES) \
	    $(LIBSTEMMER_EXTRA) \
	    $(CSHARP_SOURCES)
	destname=libstemmer_csharp-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_csharp_README $${dest}/README && \
	mkdir -p $${dest}/$(csharp_src_dir) && \
	cp -a $(CSHARP_SOURCES) $${dest}/$(csharp_src_dir) && \
	mkdir -p $${dest}/$(csharp_src_main_dir) && \
	cp -a $(CSHARP_RUNTIME_SOURCES) $${dest}/$(csharp_src_main_dir) && \
	mkdir -p $${dest}/$(csharp_sample_dir) && \
	cp -a $(CSHARP_STEMWORDS_SOURCES) $${dest}/$(csharp_sample_dir) && \
	cp -a $(COMMON_FILES) $${dest} && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the Dart library.
# Stage the Dart package under dist/libstemmer_dart-<version>/, tar it up and
# remove the staging directory.  COPYING and NEWS are renamed to LICENSE and
# CHANGELOG.md inside the package directory.
# The README entry in MANIFEST uses the make variable $(dart_package_dir),
# which is where README.md is copied to (a `$${...}` reference here would be
# a shell variable, which this recipe never sets).
dist_libstemmer_dart: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
	    $(COMMON_FILES) \
	    $(LIBSTEMMER_EXTRA) \
	    $(DART_SOURCES)
	destname=libstemmer_dart-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	mkdir -p $${dest}/$(dart_package_dir) && \
	mkdir -p $${dest}/$(dart_src_dir) && \
	mkdir -p $${dest}/$(dart_runtime_dir) && \
	mkdir -p $${dest}/$(dart_src_main_dir) && \
	mkdir -p $${dest}/$(dart_example_dir) && \
	cp -a doc/libstemmer_dart_README $${dest}/$(dart_package_dir)/README.md && \
	cp -a $(DART_SOURCES) $${dest}/$(dart_src_dir) && \
	cp -a $(DART_RUNTIME_SOURCES) $${dest}/$(dart_runtime_dir) && \
	cp -a $(DART_PACKAGE_SOURCES) $${dest}/$(dart_src_main_dir) && \
	cp -a $(DART_TEST_SOURCES) $${dest}/$(dart_example_dir) && \
	cp -a $(DART_PACKAGE_FILES) $${dest}/$(dart_package_dir) && \
	cp -a $(COMMON_FILES) $${dest}/$(dart_package_dir) && \
	mv $${dest}/$(dart_package_dir)/COPYING $${dest}/$(dart_package_dir)/LICENSE && \
	mv $${dest}/$(dart_package_dir)/NEWS $${dest}/$(dart_package_dir)/CHANGELOG.md && \
	(cd $${dest} && \
	 echo "$(dart_package_dir)/README.md" >> MANIFEST && \
	 ls $(dart_src_dir)/*.dart >> MANIFEST && \
	 ls $(dart_src_main_dir)/*.dart >> MANIFEST && \
	 ls $(dart_runtime_dir)/*.dart >> MANIFEST) && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the Java library.
dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ $(COMMON_FILES) \ $(LIBSTEMMER_EXTRA) \ $(JAVA_SOURCES) destname=libstemmer_java-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ mkdir -p $${dest} && \ cp -a doc/libstemmer_java_README $${dest}/README && \ mkdir -p $${dest}/$(java_src_dir) && \ cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \ mkdir -p $${dest}/$(java_src_main_dir) && \ cp -a $(JAVA_RUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \ cp -a $(COMMON_FILES) $${dest} && \ (cd $${dest} && \ echo "README" >> MANIFEST && \ ls $(java_src_dir)/*.java >> MANIFEST && \ ls $(java_src_main_dir)/*.java >> MANIFEST) && \ (cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \ rm -rf $${dest} dist_libstemmer_js: $(JS_SOURCES) $(COMMON_FILES) destname=jsstemmer-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ mkdir -p $${dest} && \ mkdir -p $${dest}/$(js_runtime_dir) && \ mkdir -p $${dest}/$(js_sample_dir) && \ cp -a doc/libstemmer_js_README $${dest}/README.rst && \ cp -a $(COMMON_FILES) $${dest} && \ cp -a $(JS_RUNTIME_SOURCES) $${dest}/$(js_runtime_dir) && \ cp -a $(JS_SAMPLE_SOURCES) $${dest}/$(js_sample_dir) && \ cp -a $(JS_SOURCES) $${dest}/$(js_runtime_dir) && \ (cd $${dest} && \ ls README.rst $(COMMON_FILES) $(js_runtime_dir)/*.js $(js_sample_dir)/*.js > MANIFEST) && \ (cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \ rm -rf $${dest} dist_libstemmer_php: $(PHP_SOURCES) $(COMMON_FILES) destname=libstemmer_php-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ mkdir -p $${dest} && \ mkdir -p $${dest}/$(php_runtime_dir) && \ cp -a doc/libstemmer_php_README $${dest}/README.rst && \ cp -a $(COMMON_FILES) $${dest} && \ cp -a $(PHP_RUNTIME_SOURCES) $${dest}/$(php_runtime_dir) && \ cp -a $(PHP_SOURCES) $${dest}/$(php_runtime_dir) && \ (cd $${dest} && \ ls README.rst 
$(COMMON_FILES) $(php_runtime_dir)/*.php > MANIFEST) && \ (cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \ rm -rf $${dest} dist_libstemmer_python: $(PYTHON_SOURCES) $(COMMON_FILES) destname=snowballstemmer-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ mkdir -p $${dest} && \ mkdir -p $${dest}/src/$(python_runtime_dir) && \ mkdir -p $${dest}/src/$(python_sample_dir) && \ cp $(MODULES) $${dest} && \ cp doc/libstemmer_python_README $${dest}/README.rst && \ cp -a $(PYTHON_SOURCES) $${dest}/src/$(python_runtime_dir) && \ cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \ cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \ cp -a $(COMMON_FILES) $(PYTHON_PACKAGE_FILES) $${dest} && \ (cd $${dest} && $(python) -m build && cp dist/*.tar.gz dist/*.whl ..) && \ rm -rf $${dest} ############################################################################### # Ada ############################################################################### .PHONY: ada check_ada do_check_ada ada: ada/bin/stemwords check_ada: ada $(MAKE) do_check_ada do_check_ada: $(libstemmer_algorithms:%=check_ada_%) check_ada_%: $(STEMMING_DATA_ABS)/% @echo "Checking output of $* stemmer for Ada" @cd ada && if test -f '$ tmp.txt; \ gzip -dc '$ $$r/$$d/voc.txt ;\ echo ok > $$r/$$d/output.txt ;\ echo "$$d UTF_8,ISO_8859_1 $$d" >> $$r/modules.txt ;\ done printf '%s:=%s\n' \ ALGORITHMS 'tests/runtime' \ BASELINE 'rbaseline' \ MODULES '$(RUNTIME_DATA_DIR)/modules.txt' \ other_algorithms '' \ SNOWBALL_FLAGS '-comments' \ STEMMING_DATA '$(RUNTIME_DATA_DIR)' \ THIN_FACTOR '' \ > overrides.mk rm -f algorithms.mk $(MAKE) algorithms.mk clean_runtime_tests: rm -rf $(RUNTIME_DATA_DIR) overrides.mk snowball-3.1.0/NEWS000066400000000000000000003423031520373054300140420ustar00rootroot00000000000000Snowball 3.1.0 (2026-05-22) =========================== Compiler changes ---------------- * Bug fixes: + Fix 
segmentation fault if -syntax is used on a program with no code. + Fix segmentation fault on some assignment syntax errors. + Fix bug introduced in v3.0.0 with conversion of `among` starter. If there were any commands after the among in the same command list then the among itself would get lost. Not triggered by any current algorithms. + Clear name field when removing dead assignments. This is visible in the syntax tree shown when command line option -syntax is used, but probably doesn't affect anything otherwise. * Compiler command-line options: + Using `-` for the Snowball source file is now interpreted as stdin. + Improve comments generated by `-comments` to show more details of the corresponding Snowball code (e.g. variable names, arithmetic expressions, and literal strings). + Add `-coverage` option which enables a code coverage feature. So far this tracks which among strings and functions are exercised, and which grouping characters are exercised. ! + Support `-eprefix` for all target languages. This is easy to do and provides a way to deal with externals which collide with keywords in the target language. Our build system now uses `-eprefix _` for Python to make the `stem` external non-public (it is called by BaseStemmer method `stemWord()`) and we no longer hard-code prefixing Python externals with `_`. + Describe more options in `--help` output. + Sort target language options in `--help` output. + The `-o` option is now optional. If not specified we now write output(s) to the same filename as the first source, but with a different extension (e.g. path/to/english.sbl -> path/to/english.c and path/to/english.h). + The `-o` option can now optionally include an extension so you can now write `-c++ -o path/to/foo.cxx` instead of `-c++ -o path/to/foo`, which can be more convenient (e.g. in `make` rules) and also provides an easy way to specify an alternative extension (for example, `.cxx`, `.cc` and `.cpp` are all extensions commonly used for C++ source code). 
+ Reject `-vprefix` option for target languages which don't support it (it is currently only implemented for C/C++). * Diagnostics: + Clean up and improve error reporting. + Improve line numbers reported for some errors and warnings by using the line number of an appropriate token rather than the current line number of the tokeniser (which is often the line after the command being warned about). + Improve recovery after various errors, trying to resynchronise based on what's more likely, and eliminating some additional irrelevant errors (including reporting the exact same error twice in some situations). + Emit warnings for uses of legacy Snowball language features. + The Snowball manual describes `integers (x)` as a declaration of `x` so we now warn: integer 'x' declared but not used rather than: integer 'x' defined but not used + 3.0.0 added a warning if the body of a `repeat` or `atleast` loop always signals `t` (meaning it will loop forever which is very undesirable for a stemming algorithm) or always signals `f` (meaning it will never loop, which seems unlikely to be what was intended). This warning was added to the C generator, but has been moved to generic code so it is now issued regardless of the current target language. + Improve the wording of the warning if the body of a `repeat` or `atleast` loop always signals 't' to explicitly say this means the loop is infinite. + Improve warning message for unreachable code after `not`. + `$x = x + 1` cleared the initialised status of x (rather than just not setting it) which could lead to bogus warnings that `x` is never initialised. + The compiler no longer exits immediately after reporting a division by zero error in the Snowball code. + We now report a division by zero error for `$x /= 0` (this was meant to be already implemented but wasn't working due to a code typo). + More consistent wording of "is a no-op" warnings. + Warn that `insert ''` and `attach ''` are no-ops (and don't generate code for them). 
+ Warn if a string used to define a grouping repeats characters. There's no reason to do this, so it seems likely to be a typo. + Avoid sometimes reporting "-1 blocks unfreed". * Optimisations: + Speed up processing larger Snowball programs by growing large string buffers exponentially to avoid a huge number of reallocations. For example, this reduced the time to compile serbian.sbl to C by about 80%! + Optimise reading of input file when it is seekable (which it is in typical usage). Non-seekable input files are still supported. + Optimise writing integers when generating code. 72% of integers we write are 0 to 9 and these are now written as a character. Other values are now handled without a temporary buffer, avoiding a copy. This reduced the time to compile serbian.sbl to C by about 8%, for example. + Optimise comparing among actions to find and merge equivalent actions. The comparison function used for this was carefully returning a full order, but actually we only need to know if the actions are equivalent or not which can be tested more efficiently. For example, this reduced the time to compile serbian.sbl to C by about 2%. + We now precompute the possible signals from each command which means this is now done exactly once per command, whereas previously we could end up doing it many times for some commands in some cases. The only functional change should be that we no longer make a pessimistic assumption if the function call depth reaches 100. This is cleaner but is unlikely to make a difference for any real-world Snowball programs. + Handle possible_signals for string-$ which just passes on signals from its subcommand. This doesn't affect code generation for any algorithms we currently ship. + We now only generate function bodies to a temporary buffer for target languages where we need to. This makes the code a bit clearer and reduces the amount of copying of data so will make the Snowball compiler a little faster. 
This change produces identical output for all current algorithms. + Tokenisation now decodes symbol tokens using switch statements. We don't know the length of these tokens in advance, so the old approach of binary chop on a sorted list required searching the list multiple times with different possible lengths. Alphabetical tokens are still decoded by binary chop. * Code quality: + Remove unused routines and groupings from the program during the analysis phase, which avoids each generator having to have duplicate code to skip them. + Fix small memory leak if all uses of a name are eliminated. + Always use `snprintf()` instead of `sprintf()`. If the buffer passed was too small we now emit an error rather than quietly using truncated output. + Fix GCC -Wcast-qual warnings in compiler and enable this warning by default. + Switch to using the standard C `bool` type in the code of the compiler. (The generated code still aims to require only C90.) * Other changes: + Provide a simpler way to build a cut-down Snowball compiler. The motivation here was to have a way to more quickly build a smaller Snowball compiler which only targets C. Rather than have a DISABLE_xxx macro for each language, just check if TARGET_C_ONLY is defined, and only turn off the code to actually call the other generators which greatly reduces the amount of conditionalisation required. Generic code generation changes ------------------------------- * Bug fixes: + Fix code generated for `setlimit tomark` for all target languages to restore the limit correctly afterwards. The bug was not triggered by any of the existing stemmers. + We no longer optimise repeat/atleast applied to goto/gopast on a (non) grouping. This optimisation was flawed - it requires that the code in the loop preserves the cursor's value on failure, but the target language helper functions used here don't currently do that (they probably easily could so there's scope to reinstate this optimisation). 
Looking at the stemmers we ship, this affects the code generated for one loop in indonesian.sbl, but it happens the cursor value is overwritten immediately after this loop anyway. The bug could affect non-shipped Snowball code though so isn't purely latent. This bug was introduced in Snowball 3.0.0. + When generating target language literal strings we now always escape characters which can be problematic when viewing the generated source code. We always escape control characters U+007F to U+009F, non-breaking space U+00A0 (visually identical to a space), and U+0590 and above (as a crude way to avoid literal LTR characters in sources which can result in confusing rendering). + Fix line numbers given to various tokens (the line numbers previously given were at least of lines in the same command or the line after it). These line numbers can be seen in the target language comments generated when `-comments` is used. + Fix warning and simplification of code when `not` is applied to a command which always signals `t`. Bug introduced in v3.0.0. Fixes #271. + Warn and simplify `not` applied to a command which always signals `f`. * Optimisations: + Add machinery to generate a Snowball variable as a local variable in the target language instead of it being "global" (typically a private class member in the target language). This reduces the amount of state in stemmer objects, and typically reduces the overhead of accessing these variables a little. We now do this for integers and booleans in all target languages, and for strings in target languages where benchmarking seems to show it is faster (Dart, Go, JS, Pascal, PHP, Python). It's done for a Snowball variable which is only used in one routine, that routine doesn't (directly or indirectly) call itself, and the variable is set by any code path which leads to a use of the variable. 
The mechanism which traces the code paths errs on being too conservative in some cases, but it's good enough for all instances in the code we ship, and is likely to handle the vast majority of real-world cases. We issue an "info" diagnostic to report when a variable which is only used in one routine can't be localised - please report if you see this in real world code and we can try to enhance the code path tracing. + Tail-calling and similar optimisations can now work for non-trivial routines (previously they only worked for routines consisting of a single command and not enclosed in parentheses). + A grouping test at the end of a routine now generates simpler code. + A string test at the end of a routine now generates simpler code. + Optimise testing a boolean (optionally preceded by `not`) when used at the end of a routine. + Optimise an `among` with no commands at the end of a routine. + Generate simpler code for `not` applied to testing a boolean variable. + A `not` only needs to restore the cursor when its subcommand fails, so we now consider whether its subcommand can modify the cursor on failure (rather than whether it can modify the cursor at all). Related to #226. + An `or` only needs to restore the cursor when a subcommand fails, so we now consider whether its subcommand can modify the cursor on failure (rather than whether it can modify the cursor at all). We also now consider each subcommand individually, and only emit the cursor restore for those subcommands which need it. (#226) + Both `and` and `or` only need to restore the cursor between sub-commands so no longer consider if the final subcommand might change the cursor. This makes some small improvements to the generated code for a few of the currently shipped algorithms. (#226) + Handle more commands when checking if the cursor needs restoring - this improves the generated code for tamil.sbl a bit. 
+ Single case amongs are now refactored to eliminate the `among` and so no longer call the among machinery. Sometimes a single case among is the natural way to express a single rule in Snowball code as it can show commonality with rulesets with multiple rules, but it's inefficient to actually generate as an among. Of the stemmers we currently ship, this improves code generation for arabic, estonian, greek and lithuanian. + Avoid unnecessary cursor update in among helpers. We only need to update the cursor on success, but were unconditionally doing so after calling an among function. + Handle more commands in repeat_score(). None of these help code generation for any currently shipped algorithms, but they are valid to optimise here. + Canonicalise `<-''` to `delete`. + Simplify some cases of compound assignment operators when the argument is (or can be simplified to) a constant integer, like we already do for arithmetic expressions. For example, `$x += len '{a"}' - len 'a'` is a no-op when using a fixed-width encoding. + Canonicalise `fail C` to `false` in some cases where `C` has no side-effects. + Removing unreachable code could leave single-entry `and`/`or` nodes which could result in generating target language code with unused variables. These nodes are now replaced with their subnode. + Eliminate `true` below `and`/`false` below `or`. These are unlikely to appear verbatim in real programs, but can be created by optimisations, and also can appear in runtime tests, leading to the generated target language code having unused variables and/or unreachable code. + Canonicalise setmark, atmark and atlimit by converting `setmark x` to `$x = cursor`, `atmark x` to `$(cursor == x)` and `atlimit` to either `$(cursor >= limit)` or `$(cursor <= limit)` (depending on whether we're in backwardmode or not). 
This means the target language generators have three fewer commands to handle, and also gives us tail-calling of `atlimit` and `atmark x` (there's a tail-callable use of `atlimit` in the Turkish stemmer). + Remerge among actions after optimisation. It seems hard to fully move the code to merge them later, but we can check for actions which have become equivalent to `true` or to other actions after optimisation but before we generate code. * Code quality: + Find Snowball routines which are not reachable by calling an external. We no longer generate code for such routines, nor for variables and groupings which are only used in them, which helps to avoid "unused" warnings in the generated target language code. + If the sub-command of `repeat`/`atleast` always signals `t` or always signals `f` we now prune the rest of the current command list, and simplify the command for always `f` (c_repeat -> c_do; c_atleast -> c_bra). These changes help to avoid generating redundant target language code which can trigger errors or warnings. * Other changes: + `delete` and `<-` now update the slice end (see the "Snowball Language Changes" section). Ada --- * Bug fixes: + Ada variable names are case-insensitive, so if two Snowball names of the same type differed only by case we would generate Ada code with a name collision. We now avoid such collisions by adding a counter after the type code for the second and subsequent names that differ only by case. + Ada stemmer names are now prefixed with `S_` so `or.sbl` now generates stemmer `S_or`, avoiding a name clash with an Ada keyword. + Fix Ada code generated for `setlimit tomark p`. This affected the generated code for the Lithuanian stemmer, but it appears by luck in this case the bug didn't actually affect the stemmer's output for any input. + Fix `setlimit` ... `repeat` bug. The generated Ada code was running the code to recover from a failure inside a `repeat` loop twice due to a missing line of code compared to other generators. 
In `backwardmode`, the failure code happens to be idempotent so running it twice doesn't cause a problem, but in forwards mode this results in the cursor getting double adjusted if the length of the stem has changed due to insertions, deletions or substitutions. None of the existing algorithms use `setlimit` in forwards mode, so they're unaffected by this bug. Fixes #275. + Fix overcopying in string replacement code. The code to move the tail up/down was copying one byte too many. We're working in a 1024 byte fixed-length string buffer, and the maximum allowed input word is one byte shorter, so it seems this was harmless in practice. + Allow characters <32 and 127 in string literals. + `=S` can no longer result in the slice ends becoming negative and triggering a CONSTRAINT_ERROR (the slice is now specified to be unset after `=S` - see the "Snowball Language Changes" section). + Fix Ada code generated for string-$ which was actually partly Pascal code (the Ada generator was originally based on the Pascal one) and didn't even compile. To fix this, Snowball string variables are now handled in Ada the same way as the current string. This means they now take up more space (a fixed 1KB), but a typical Snowball program has either no string variables or just one so the overhead seems acceptable. + Fix matching of an empty string variable. This valid Snowball code would trigger "failed precondition" in Ada: externals (stem) strings (s) define stem as ([] ->s s) + Fix assumption that there's a single external called "stem". + Fix incorrect assumption that an among containing the empty string always matched, even if the empty string had a gating function. This construct is not used by any existing stemmers. * Optimisations: + Avoid calling among helper when the among contains only strings which are one byte long, no among functions are used, and there are no actions. * Code quality: + Fix indentation of generated grouping tables. + Rename Context to Z in runtime code.
This now matches variable naming in the generated Ada code (and also the C runtime and generated C code). + Eliminate redundant limit check (the Skip_Utf8 helper also checks the limit). Looking at the history this check is a left-over from when the generated code directly incremented the cursor. + Emit Ada literal strings without redundant empty strings between adjacent escaped bytes. + Generate dummy loop around `or`, which allows us to handle a sub-command succeeding with Ada `exit` rather than `goto`, which seems clearer. + Avoid creating unused labels. This is just a cosmetic improvement - there are no longer mysterious gaps in the numbering of labels in the generated code. + Avoid generating unreachable `exit`. * Other changes: + Implement support for `?` (debug command). The code we generate for this case is gnat-specific, but previously the code generated didn't compile so working with one implementation seems a step forwards. The `?` command can now be used to debug Ada, and someone with actual Ada knowledge can now more easily step in and provide a portable replacement. C/C++ ----- * Bug fixes: + Maintain invariant that the C variable corresponding to a Snowball string variable is non-NULL. Previously we would release and NULL out the entry in some error cases, but elsewhere the code was assuming the value was non-NULL. + Fix invalid code generated for `setlimit`. This doesn't happen for `setlimit tomark` (which is the only way `setlimit` is used in the stemmers we currently ship). Bug introduced in v3.0.0. + Fix codegen for `hop` with constant argument. We were relying on the cursor being restored on failure by the code which handled that failure, but if that code is a repeat or atleast command then it has an optimisation which assumes `hop` won't do this. This means we generated incorrect C code for some cases where `hop` was used inside `repeat` or `atleast`. This doesn't functionally affect any of the stemmers we currently ship.
Bug introduced in v2.1.0. + Fix bug in code generated when `-vprefix` is specified, introduced in Snowball 2.1.0. + Fix incorrect assumption that an among containing the empty string always matched, even if the empty string had a gating function. This construct is not used by any existing stemmers. * Optimisations: + Rework how non-localised variables are stored, which eliminates an indirection on every access to such a variable, and also avoids some extra allocations (one if a stemmer has any non-localised integer or boolean variables, and another if the stemmer has any string variables). So it uses a bit less memory, it makes creating and destroying a stemmer faster, and it also makes stemming a bit faster (though only by ~0.1% for the English stemmer on our sample vocabulary). The `-vprefix` option now generates getter functions rather than using macro magic, which means the syntax for accessing Snowball variables from C has changed. + We now maintain the invariant that SN_env's p member is non-NULL, which simplifies the runtime code. + We now have a specialised implementation of the slice_del() runtime helper. Deleting the slice is a fairly common operation, and can be done more simply than via a generic replace_s() with an empty replacement string. This speeds up the English stemmer by about 1% on our test vocabulary. + Avoid calling among helper when the among contains only strings which are one byte long, no among functions are used, and there are no actions. + Only fetch SIZE() in replace_s() if we need it. + Don't return adjustment from replace_s() runtime helper since calculating the adjustment in the one caller where we actually want it is just one integer addition and one integer subtraction, and that turns out to be slightly more efficient as well as simpler. + Move check for negative hop from runtime to generated code. This means we can omit it for hop with a constant argument, which is all uses of hop in the stemmers we currently ship. 
* Code quality: + The generated header is now included from the generated C/C++ source file (which seems cleaner than the previous approach of generating the same prototypes in the header and source file). + The implementation of among functions has changed. Previously we stored a function pointer in struct among, but that requires relocation when the code is in a dynamic library, which adds load-time overhead and means the among structures can't be put in a read-only section. We now store an integer index instead, and pass in a pointer to a dispatcher function when calling the find_among()/find_among_b() helper which gets called when this index is non-zero. The value of the index is stored in z->af so the dispatcher function can use it. If only one unique function is used in an among, we can just pass this to find_among() as the dispatcher which reduces the overhead for this common case. Profiling with cachegrind suggests this change adds a small overhead to algorithms which use among functions - currently finnish and hindi (and also lovins, but that's really only of academic interest and is not enabled by default). + Avoid long string in C source. C90 only guarantees support for literal strings up to 509 characters. Fixes GCC -Woverlength-strings warning. + Avoid C23 feature in C runtime code, introduced in Snowball 3.0.0. Initialising with empty braces was only standardised in C23 (though seems to be widely supported as an extension). + Fix code generated for `setlimit` to be C90. Bug introduced in v3.0.0, but isn't triggered by any of the stemmers we currently ship. + Fix -Wshadow warning for nested string-$ use. We were generating code using a C variable with the fixed name `failure` - now an integer suffix is appended, and we only emit the variable in cases where the subcommand signal isn't known at compile time. 
+ Generate `do {`...`} while (0)` around `or` code, which allows us to handle a sub-command succeeding with `break;` rather than having to use `goto`, which reduces the number of labels used and makes the generated code a bit easier to follow. + C comments are now generated for `(` and `do` when `-comments` is used. + We now generate `+=` or `-=` for `hop ` (instead of something like `z->c = z->c + 2`). The C compiler should treat both the same, but it arguably makes the generated code a little clearer. * Other changes: + C++: The `-c++` option used to generate exactly the same code as for C, except with extension `.cc` instead of `.c` but now: - C++ classes are generated. - C++ `bool` is used for Snowball booleans. - Loop variables are declared inside `for (`...`)`. - Allocation failures and internal errors (e.g. slice_check() failing) throw a C++ exception - this is a bit simpler and more efficient than the C code approach of returning -1 which then has to be checked for and propagated through the generated code. + Snowball's debug command (`?`) now works out of the box (previously you had to adjust a `#if 0` preprocessor conditional in the runtime code). + Rename `runtime/header.h` (which really seems too generic, and is also easy to confuse with `compiler/header.h`) to `runtime/snowball_runtime.h`. We expect most users will be using the C stemmers through libstemmer and so won't be affected by this. C# -- * Bug fixes: + Fix code generated for `<-s`. This is not used by any of the stemmers we currently ship. Test case based on one from ajroetker in #270. + Fix code generated for string-$. This feature is not used by any of the stemmers we currently ship. + Fix assumption that there's a single external called "stem". * Optimisations: + Use Debug.Assert() in slice_check() runtime helper.
Previously the runtime code wrote a diagnostic message and continued if one of these checks failed, but failures should only happen with a Snowball program containing logic errors, or for bugs in the Snowball compiler or its runtime (or possibly in the C# compiler, runtime, OS, hardware, etc). Therefore an assertion seems an appropriate choice, and means the check is not enabled for a production build, which seems more helpful overall. See #242. * Code quality: + Eliminate duplicates from groupings. We currently implement these for C# with a linear string search, and a side-effect of this change is that the grouping string is now sorted, which will affect the time taken to look up different characters in an arbitrary way (none of the Snowball sources seem to try to list characters in frequency order). Really C# should be fixed to use an O(1) lookup like other target languages. + The implementation of among functions has changed. We now store an integer index in the Among class, and pass a dispatcher function to the among helper method. If only one unique function is used in an among, we can just pass this to the helper method as the dispatcher which reduces the overhead for this common case. Crude profiling with `time make check_csharp` suggests this doesn't harm performance (perhaps a little faster, but maybe just within the noise). The main benefit is all Among arrays can now be static, which previously wasn't possible for those which used among functions (#146). + Remove unused return value from Stemmer.Replace() runtime helper. + Fix inaccurate doc comments on runtime functions. * Other changes: + csharp_stemwords: Speed up output to stdout. + csharp_stemwords: Don't write the chosen stemmer to stdout. This is not really useful information, and breaks sending the stemmed words to stdout because they're preceded by extra output. + csharp_stemwords: Try to open input before output so we don't leave an empty output file behind if we can't open the input file.
Go -- * Bug fixes: + Fix code generated for non-constant hop. A non-constant integer expression has type `int` in the generated Go code, but the hop helpers expected `int32`. For a constant hop this worked because Go integer literals are untyped, so will convert to `int32`. To fix this, the helpers now take `int` instead of `int32`. + Fix code generated if `minint` or `maxint` is used. In this case we were generating `use std::usize;` near the start of the Go code, but that's actually Rust code and a hangover from the Go backend being originally based on the Rust one. + The Go code generated for `->` was incorrectly signalling `f` if the slice was empty. Luckily this case is not exercised by any current algorithms. See #242. + Fix code generated for string-$ (which isn't used by any of the algorithms we currently ship). + A snowball `external` could not previously be called from within the Snowball program. This is allowed by the Snowball language, but none of the shipped stemmers do this, and it's unlikely any stemmer would. Perhaps it's useful if you use Snowball for other string-processing tasks. + Fix handling of `minint` and `maxint` - we were generating some code copied verbatim from the Rust generator for this case which was not valid Go. (These are not used by any of the algorithms we currently ship.) * Optimisations: + Reuse `env` in stemwords which is measurably faster than creating a new one for every word. * Code quality: + Eliminate unnecessary semicolons from generated code. + Fix formatting of generated code. The code gets run through gofmt which was fixing up these issues, but better to generate the code cleanly to start with. The only things which gofmt now changes are that it indents variable names to align in adjacent variable declarations, and a couple of things which are apparently for compatibility with older versions of Go.
+ Runtime helpers SliceDel() and SliceFrom() always returned true, but the generated code included failure checks in case false was returned. These helpers no longer return anything, and the checks are gone. * Documentation: + Recommend that users reuse an `env` since this is measurably faster than creating a new one for every word. * Other changes: + Remove `-gopackage` option from compiler. Use `-package`/`-P` instead (`-gopackage` has just been an alias for these since Snowball 2.0.0). Java ---- * Bug fixes: + Generate correct Java code for ASCII control chars in string literals. + Fix code generated for string-$. As part of this fix, we now use char[] for string variables as well as the current string, which makes it much simpler to switch to working on a string variable and back. Fixes #252. + Fix assumption that there's a single external called "stem". * Other changes: + The generated Java classes no longer implement Serializable. This support was added in 2016, but in 2026 this approach to serialization in Java is apparently no longer used due to security problems. Fixes #255. Javascript ---------- * Bug fixes: + Fix `->` to work when the slice is empty - previously it incorrectly signalled `f` for this case. Luckily this case is not exercised by any current algorithms (#242) + Generate public functions for all externals. Patch from simlrh (#258). + Fix code generated for string-$ * Optimisations: + Use startsWith()/endsWith() in eq_s()/eq_s_b(). This is quite a bit faster as it avoids slice() creating a temporary string (e.g. measured a reduction of ~17% wallclock time for tamil on the test vocabulary, taking the fastest of 5 runs before and after). + Optimise among when all actions are `<-` with a literal string. We now generate a single call to slice_from() with the argument obtained by indexing into an array of literal strings. This is perhaps faster, albeit not by much, but it definitely results in smaller code, which is helpful for in browser use. 
See #227. + The substring_i member in the Among class is now an offset from the current index, and now zero in the common case where there's not another string which is a sub-prefix/sub-suffix. We've also swapped the order of elements so we can omit this in the common case (when it is zero and there's no among function). This reduces the size of the generated Javascript code (even after minification). Fixes #236. + Change slice_check() to assert its conditions. In C we must not perform string slicing if slice_check() fails because that could result in writing outside of the allocated buffer, but it's not problematic in this way for Javascript, and the situations which slice_check() checks for should only happen with a Snowball program containing logic errors, or for bugs in the Snowball compiler or its runtime (or possibly in the Javascript interpreter, OS, hardware, etc). Therefore assert() seems an appropriate choice. * Code quality: + Convert to using Javascript modules and classes. The way among functions are called has been reworked to allow this, copying the approach now used for C and C# (#234, #240). Patches from Adam Turner and Titus Ng. + Adjust generated code to work with deno, and suppress a few deno warnings which are hard to avoid in generated code. + Avoid generating blocks around failure handling. The failure handling code is always a single statement (and if we ever needed more than a statement for some situation then we could arrange to add a block for just those situations). This significantly reduces the size of the generated JS code. + Always inline code for `=>`. The code is not much longer than the call to a helper function in BaseStemmer. Also in 3.0.0 we deprecated `=>` and nothing we ship contains this command, so removing it from BaseStemmer reduces the total code size a little. + Rename BaseStemmer's internal `cursor` property to `c`.
Unfortunately, `cursor` is a DOM property, so Javascript minifiers are cautious about renaming it to avoid breaking code. The name `c` matches the naming we use for C, Ada and Pascal. + Generate smaller code for hop by constant. All current uses of hop in the stemmers we ship have a constant argument, so avoid using a temporary variable in these cases. + Optimise `+=1` to `++`, `-=1` to `--`. These are a byte shorter, and it seems Javascript minifiers don't do this for us because it's not a safe transformation unless the minifier can deduce that the variable can't hold a string. + Improve temporary var naming and use. These variables don't need unique generated names now we're declaring them as `const` which has more sensible scoping rules than `var`. + Generate smaller code for `insert` and string-`=`. In some cases we know we have the value of member variable `this.cursor` in local `const c` so use the latter instead. + Use triple equality for JavaScript. Patch from Adam Turner. + Fix position of grouping type comment which is now placed consistently with other type comments. + Use `a` instead of `among_var` in generated code. This reduces the size of the generated code, which is helpful if a minification step isn't being used. + Consistently cuddle braces in runtime code. The style wasn't entirely consistent before, and cuddling braces matches the generated Javascript code and the Snowball C code. + Generate block around case to bound the scope of `const` and `let` within the case. + Use `let` in README example. + Use `let` consistently in stemwords.js. + Initialise integer Snowball variables - we annotate them as being type "number" so we shouldn't let them have value undefined. Patch from Adam Turner. + Improve/fix typescript annotations in runtime and generated code. + Annotate runtime with @ts-expect-error. It doesn't seem to be possible to express the types fully in some places, but the invariants we require are ensured by the Snowball compiler. 
Annotating the expected errors allows unexpected type checking errors to be more easily seen, and they are now fatal in CI. + Use `===` and `!==` in stemwords.js. Patch from Adam Turner. * Other changes: + Make stemmer subclasses anonymous and export them by default. This makes creating a stemmer object easier as you only need to build the filename of the stemmer subclass, and not also its class name. + Adjust interpretation of `-parentclassname` option. We supply the JS snowball runtime so being able to specify a different base class name doesn't seem very useful, so instead interpret this as the name to import the base class as in generated stemmers. It now defaults to just `B` which reduces the size of the generated stemmer code a little (even after running it through most Javascript minification tools). + Improve stemwords.js option parsing. Make `-i` and `-o` optional to match other target language versions of stemwords. Eliminate the check that there are at least 3 command line arguments as we don't require any now. If we encounter an argument we don't understand, we now report it and show the usage message (previously we silently ignored it). We now exit with status 1 if there's a problem parsing the command line. + stemwords.js: Emit help message in one console.log. Patch from Titus Ng (#221). Pascal ------ * Bug fixes: + We were generating invalid Pascal code when tail-calling or calling a routine which always fails. Neither case is currently exercised by any stemmers we ship and generate Pascal code for (the Pascal generator currently only supports iso-8859-1). + Fix code generated for string-$ (which isn't used by any of the algorithms we currently ship). + Fix assumption that there's a single external called "stem". * Code quality: + Merge EqS and EqV runtime functions. We can get the length of a Pascal AnsiString `s` cheaply with `Length(s)` so there isn't a need to pass in the length in the string literal case.
+ Eliminate `While` in code generated for `repeat`/`atleast`. Pascal lacks `Continue` (at least as a standard feature) and this loop only exists so we can jump back to its start with `continue` in other languages - we have a `Break;` at its end so it doesn't loop in the normal way. In Pascal we generate a label before the loop and use `goto` to continue iterating, so we can get rid of the Pascal loop entirely. + Use `Break` instead of `Goto` in code generated for `go`/`gopast`. + Generate dummy loop around `or` so we can handle a sub-command succeeding with Pascal `Break` rather than `Goto`, which seems clearer. + Avoid generating `Repeat` ... `Until True` dummy loops which are not actually needed. + Fix problem introduced in v3.0.0 with formatting of code generated for `go`/`gopast` applied to a grouping. + Switch to a simpler name mangling system. Pascal variable names are case-insensitive but Snowball names are case-sensitive. We used to address this by encoding the case of letters into a prefix on the name but that can generate long and ugly names in some cases (e.g. integer Foo_Bar -> IUllU_Foo_Bar). We now avoid collisions by adding a counter after the type code for the second and subsequent names that differ only by case (so Foo_Bar is only mangled if there's another integer which differs only by case which is declared before it, and even then just becomes something like I2_Foo_Bar). + Emit Pascal literal strings without redundant empty strings between adjacent escaped bytes. + The -comments option now includes the values of string literals, so has been changed to generate "rest of line" comments (starting `//`) rather than block comments (delimited by `{` ... `}`) so that string literals containing `}` don't need escaping. We were already using `//` comments in the Pascal runtime so this shouldn't harm portability. Python ------ * Bug fixes: + Fix `algorithms()` when forwarding to PyStemmer. 
It looks like this has never worked as the code has been like this since it was merged, and we were forwarding to a method which PyStemmer doesn't provide and never seems to have provided. + stemwords.py: Make -i and -o optional. The command syntax already suggested they were, but actually we gave an error if they were omitted. + Fix code generated for string-$ (which isn't used by any of the algorithms we currently ship). + Fix `->` to work when the slice is empty - previously it incorrectly signalled `f` for this case. Luckily this case is not exercised by any current algorithms (#242) + Remove deprecated licence classifier which now triggers a deprecation warning from Python's setuptools. We already specify the licensing in the now preferred way via `license=` with a SPDX licence expression. * Optimisations: + Optimise single-character string literal checks in the same way we already do for C. This seems to be measurably faster (tested with Turkish which has lots of single character literal tests). + Groupings are now implemented via a Python set, or a string for small groupings. + Eliminate use of exception in code generated for `or`. We can instead wrap the code in a loop and use `break`. + Eliminate use of exception in `goto` and `gopast`. We can just use `break` here to exit the `while` loop we're also inside and move the `except` from the previous `try` onto the `while`. + Avoid using a temporary for `hop` with a constant argument as benchmarking with timeit shows this is faster. + Optimise string test by using startswith()/endswith() with suitable start/end parameters which avoids creating a temporary substring and avoids an explicit limit check. This speeds up artificial testcases consisting of `goto 'the'` by 10%. + Optimise among when all actions are `<-` with a literal string. We now generate a single call to slice_from() with the argument obtained by indexing into an array of literal strings. See #227. 
+ Reduce overhead of code to forward to PyStemmer, both when forwarding and when using the pure Python stemmers. + Reuse exception classes much more. This reduces the number of labN classes we need by 142 over all the current stemmers. + Change slice_check() to assert its conditions. In C we must not perform string slicing if slice_check() fails because that could result in writing outside of the allocated buffer, but it's not problematic in this way for Python, and the situations which slice_check() checks for should only happen with a Snowball program containing logic errors, or for bugs in the Snowball compiler or its runtime (or possibly in the Python interpreter, OS, hardware, etc). Therefore assert() seems an appropriate choice. * Code quality: + Use _ as dummy loop variable. We don't use the loop variable's value, and the loop itself tracks the current iteration so generating nested loops using `_` as the loop variable works correctly. + Avoid mysterious gaps in the numbering of variables in the generated code. This was already done for the other languages, but I missed Python it seems. + Avoid generating unused lab0 class for a Snowball program which doesn't use any failure labels. + Avoid generating a blank line at start of the body of a Snowball `loop`. + stemwords.py: Replace deprecated `codecs.open()` with built-in `open()`. Patch from Dmitry Shachnev. * Documentation: + Remove unnecessary semicolons from Python code in docs. * Other changes: + Remove Python 2 support. We stopped officially supporting it in Snowball 2.1.0, but now we've actually stripped out support. Versions of Python ≥ 3.3 continue to be supported. Patch from Dmitry Shachnev (#212). Rust ---- * Bug fixes: + Fix code generated for string-$ (which isn't used by any of the algorithms we currently ship). + A snowball `external` could not previously be called from within the Snowball program. 
This is allowed by the Snowball language, but none of the shipped stemmers do this, and it's unlikely any stemmer would, but perhaps it's useful if you use Snowball for other string-processing tasks. + The generated code previously treated an empty string returned by slice_to() as an error, but this was buggy since if the slice is empty the return value will be an empty string. The helper doesn't try to signal an error with an empty string so we can just drop this check. Luckily this case is not exercised by any current algorithms. See #242. + Fix incorrect assumption that an among containing the empty string always matched, even if the empty string had a gating function. This construct is not used by any existing stemmers. * Optimisations: + Avoid calling among helper when the among contains only strings which are one byte long, no among functions are used, and there are no actions. * Code quality: + Fix formatting of code generated for `goto`/`gopast` applied to a grouping or inverted grouping. This is just a cosmetic problem - functionally it was correct. The poor formatting was introduced in v3.0.0. + Runtime helpers slice_del() and slice_from() always returned true, but the generated code included failure checks in case false was returned. These helpers no longer return anything, and the checks are gone. + Generate space after condition in integer test (purely cosmetic). New Code Generators ------------------- * Add Dart generator from Ryan Heise (#156, #250). * Add PHP generator from Tim Whitlock and Olly Betts (#243). Requires PHP 8.3 or later, which allows us to use typed class constants. * Add Zig backend from AJ Roetker. Requires Zig 0.16.0 or later. Snowball Language Changes ------------------------- * `delete` and `<-` now update the slice end. The manual said that after `[` and `]` "the slice ends will retain the same values until altered", which doesn't make it clear what happens for operations which modify the text the slice ends are in. 
The existing handling here was inconsistent between commands: `delete` and `<-` left the slice ends on the same numeric positions, while `attach` and `insert` adjusted the slice ends to leave the slice marking the equivalent substring of the updated string. When working in UTF-8 the slice end could end up in the middle of a multi-byte character after `delete` or `<-`, which seems especially undesirable. I talked this over with Martin Porter and we've agreed that it makes sense for `delete` and `<-` to also update the slice ends (in fact only the right end needs adjusting) and I've clarified the wording in the manual. Existing algorithms we ship don't rely on what the slice is set to after these commands. * The slice is now specified to be unset after `=S` (so the same state as at the start of the program). Previously Snowball attempted to adjust the slice after `=S`, but there isn't an obvious adjustment in general because it can replace part of the content of the slice. Martin said he'd not thought of this case, and we've concluded it's best to adjust the Snowball language definition. New stemming algorithms ----------------------- * Add Czech stemmer from Olly Betts and Jim O’Regan (#151). * Add Persian (Farsi) stemmer from Saeid Darvish (#181). * Add Polish stemmer from Dmitry Shachnev (#245). * Add Sesotho stemmer from Kamohelo Lebjane (#260). Behavioural changes to existing algorithms ------------------------------------------ * Danish: + Adjust to handle apostrophe (#187). + Restrict undoubling to valid cases. Coverage showed that a number of the consonants we would undouble never occur in our Danish vocabulary. Testing a larger list didn't find any matches for Danish words either, so restrict the undoubling which reduces the potential for damage to foreign words and should be a little more efficient. * English: + Restore exception for `skis` so it stems to `ski`. This reverts a change made erroneously in Snowball 3.0.0.
+ Improve the stemming of some words starting `inter`: - We now avoid conflating intern, internal, international and internment. - We now conflate interfere/interferes/interference with interfered/interfering. - The stem of `interval` is now `interval` rather than `interv`, which is mostly a cosmetic change as no unrelated words stem to `interv`. * Estonian: + Handle apostrophe (#187). * Finnish: + Handle apostrophe (#187). + Improve fallback from illative rules. If a word ends -han, -hen, -hin, -hon, -hän or -hön but the vowel before does not match we were not removing a suffix in case_ending, we now fallback to handling as a genitive and remove -n. This changes how we handle about 90 words - almost all for the better, most of the rest seem neutral changes. + Allow "ø" to match with -hön as this is seen with Norwegian place names, e.g. Bodøhön. + Remove illative form -hun. This improves the stemming of 14 words in our test vocabulary. * German: + Handle apostrophe (#187). * Italian: + Handle elisions (#187). * Lithuanian: + Don't remove -er- before normal suffixes. These aren't real grammatic suffixes and seem to have been included mainly to try to conflate ancient forms of the Lithuanian word for "sister" (e.g. "sesers") with modern forms (e.g. "sesė"). We weren't even doing a complete job there however as "seserimis" and "seseris" were not handled. Removing these suffixes entirely means we no longer try to conflate the ancient and modern forms here, but at least all the forms of the old word get grouped, as do all forms of the new word. The stemming for ~150 other words is also improved, without obvious downsides. Patch from Justas Sakalauskas (#263). + Remove trailing apostrophe as final step - an apostrophe is sometimes used to separate a Lithuanian ending on an international word (#187). * Norwegian: + Adjust to handle apostrophe (#187). * Polish: + Remove optional apostrophe after removing suffix. 
Polish uses an apostrophe to separate loanwords from native suffixes. (The correct use is to mark the elision of the final sound of a loanword before a Polish inflectional ending, but it's also often used with any loanword) (#187). Optimisations to existing algorithms ------------------------------------ * English: + Optimise -eed, -eedly handling by performing the much cheaper R1 check before the among of exceptional cases. * Esperanto: + Eliminate use of among functions. It's easy to avoid them, and they come with a performance overhead in some target languages. For C, the new version is 0.09% faster (from cachegrind estimated cycle count). * Indonesian: + Avoid use of among functions, which gives a 1.9% speed up for C (from cachegrind estimated cycle count). * Lithuanian: + Minor simplification/optimisation by relying on Snowball restoring the cursor on failure. * Turkish: + Simplify `not test C` to just `not C`. If C succeeds, then the `not` fails and the cursor will get restored by whatever handles that signal. Code clarity improvements to existing algorithms ------------------------------------------------ * Finnish: Rename `V1` and `LONG` to match the names used in the algorithm description on the website. * Italian: Eliminate use of legacy among starter. Build system ------------ * The default flags used with `ar` are now `-cr` instead of `-cru`. Many Linux distros configure `ar` to use option `D` (deterministic mode) by default, which was triggering a warning that option `u` is ignored. Option `u` is just a minor optimisation for the case where the archive already exists and only some object files have changed, so it seems best to just not try to use it and avoid the warning. 
Make variable `ARFLAGS` can now be used to specify flags to use with `ar`, so if you want to continue using `-cru`, you can use: make ARFLAGS=-cru If `D` is on by default in your `ar`, you'll actually want: make ARFLAGS=-cruU * Add comment documenting how to use iconv.py (simple pure-Python alternative which allows running the testsuite without iconv installed). * `make clean` now removes all built files for all target languages, and is now tested by CI to ensure this doesn't regress. * Make "make check_utf8" parallel-safe by avoiding writing the stemmed output to disk by default (except for Arabic). To get the output saved as tmp.txt on error for debugging you can now use: `make SAVETMP=1 check_utf8`. Patch from Adam Turner (#237, #238). * Ada: Fix parallel build by adding missing dependency from .adb to the corresponding .sbl file (#237, #238). * Go: Use `$(go)` for `go generate` as well. * Python: Omit output "(THIN_FACTOR=)" if set empty. * Add SNOWBALL_FLAGS, intended to allow passing options such as `-comments` and `-coverage` during development and debugging. * Add make targets to assist comparing generated code before and after a compiler change: `baseline-create`, `generate` and `baseline-diff`. * We now have CI testing that the Snowball compiler builds as C99 (we were already testing that the generated C code builds as C90). Fixes #283, reported by Domingo Alvarez Duarte. Testsuite --------- * New testsuite for the Snowball compiler which tests parsing, errors and warnings. * New runtime testsuite which tests the implementation of Snowball language features in each supported target language. These provide something much more like a proper set of unit tests rather than relying on checking all the algorithms produce the expected output to validate all the target language generators. These tests are run with -comments on to provide some test coverage for this option. Fixes #157. * stemtest: Add more number testcases, relocated to here from finnish/voc.txt. 
They're better by stemtest as we want to avoid any stemmer damaging numbers, and testcases here can easily be run for all stemmers. Snowball 3.0.1 (2025-05-09) =========================== Python ------ * The __init__.py in 3.0.0 was incorrectly generated due to a missing build dependency and the list of algorithms was empty. First reported by laymonage. Thanks to Dmitry Shachnev, Henry Schreiner and Adam Turner for diagnosing and fixing. (#229, #230, #231) * Add trove classifiers for Armenian and Yiddish which have now been registered with PyPI. Thanks to Henry Schreiner and Dmitry Shachnev. (#228) * Update documented details of Python 2 support in old versions. Snowball 3.0.0 (2025-05-08) =========================== Ada --- * Bug fixes: + Fix invalid Ada code generated for Snowball `loop` (it was partly Pascal!) None of the stemmers shipped in previous releases triggered this bug, but the Turkish stemmer now does. + The Ada runtime was not tracking the current length of the string but instead used the current limit value or some other substitute, which manifested as various incorrect behaviours for code inside of `setlimit`. + `size` was incorrectly returning the difference between the limit and the backwards limit. + `lenof` or `sizeof` on a string variable generated Ada code that didn't even compile. + Fix incorrect preconditions on some methods in the runtime. + Fix bug in runtime code used by `attach`, `insert`, `<-` and string variable assignment when a (sub)string was replaced with a larger string. This bug was triggered by code in the Kraaij-Pohlmann Dutch stemmer implementation (which was previously not enabled by default but is now the standard Dutch stemmer). + Fix invalid code generated for `insert`, `<-` and string variable assignment. This bug was triggered by code in the Kraaij-Pohlmann Dutch stemmer implementation (which was previously not enabled by default but is now the standard Dutch stemmer). 
+ Generate valid code for programs which don't use `among`. This didn't affect code generation for any algorithms we currently ship. + If the end of a routine was unreachable code the Snowball compiler would think the start of the next routine was also unreachable and would not generate it. This didn't affect code generation for any algorithms we currently ship. * Code quality: + Only declare variables A and C when each is needed. + Fix indentation of generated declarations. + Drop extra blank line before `Result := True`. C/C++ ----- * Bug fixes: + Fix potential NULL dereference in runtime code if we failed to allocate memory for the p or S member for a Snowball program which uses one or more string variables. Problem was introduced in Snowball 2.0.0. Fixes #206, reported by Maxim Korotkov. + Fix invalid C code generated when a failure is handled in a context with the opposite direction to where it happened, for example: externals (stem) define stem as ( try backwards 'x' ) This was fixed by changing the C generator to work like all the other generators and pre-generate the code to handle failure. + Eliminate assumptions that NULL has all-zero bit pattern. We don't know of any current platforms where this assumption fails, but the C standard doesn't require an all-zero bit pattern for NULL. Fixes #207. * Optimisations: + Store index delta for among substring_i field. This makes trying substrings after a failed match slightly faster because we can just add the offset to the pointer we already have to the current element. * Code quality: + Improve formatting of generated code. C# -- * Bug fixes: + Add missing runtime support for testing for a string var at the current position when working forwards. This situation isn't exercised by any of the stemming algorithms we currently ship. + Adjust generated code to work around a code flow analysis bug in the `mcs` C# compiler. * Code quality: + Prune unused `using System.Text;`. + Generate C# with UTF-8 source encoding. 
This makes the generated code easier to follow, which helps during development. It's also a bit smaller. For now codepoints U+0590 and above are still emitted as escape sequences to avoid confusing source code rendering when RTL scripts are involved. Go -- * Optimisations: + Drop some unneeded Go code generated for string `$`. None of the shipped stemmers use string `$`, though the Schinke Latin stemmer algorithm on the website does. * Code quality: + Dispatch among result with `switch` instead of an `if` ... `else if` chain (which looks like we did because the Go generator evolved from the Python generator and Python didn't used to have a switch-like construct). This doesn't make a measurable speed difference so it seems the Go compiler is optimising both to equivalent code, but using a switch here seems clearer, a better match for the intent, and is a bit simpler to generate. + Generate Go with UTF-8 source encoding. This makes the generated code easier to follow, which helps during development. It's also a bit smaller. For now codepoints U+0590 and above are still emitted as escape sequences to avoid confusing source code rendering when RTL scripts are involved. Java ---- * The Java code generated by Snowball now requires Java >= 7. Java 7 was released in 2011, and Java 6's EOL was 2013 so we don't expect this to be a problematic requirement. See #195. * Optimisations: + We now store the current string in a `char[]` rather than using a `StringBuilder` to reduce overheads. The `getCurrent()` method continues to return a Java `String`, but the `char[]` can be accessed using the new `getCurrentBuffer()` and `getCurrentBufferLength()` methods. Patch from Robert Muir (#195). + Use a more efficient mechanism for calling `among` functions. Patch from Robert Muir (#195). * Code quality: + Consistently put `[]` right after element type for array types, which seems the most used style. + Fix javac warnings in SnowballProgram.java. 
+ Improve formatting of generated code. Javascript ---------- * Bug fixes: + Use base class specified by `-p` in string `$` rather than hard-coding `BaseStemmer` (which is the default if you don't specify `-p`). None of the shipped stemmers use string `$`, though the Schinke Latin stemmer algorithm on the website does. * Code quality: + Modernise the generated code a bit. Loosely based on changes proposed in #123 by Emily Marigold Klassen. * Other changes: + The Javascript runner is now specified by make variable `JSRUN` instead of `NODE` (since node is just one JS implementation). The default value is now `node` instead of `nodejs` (older Debian and Ubuntu packages used `/usr/bin/nodejs` because `/usr/bin/node` was already in use by a completely different package, but that has since changed). Pascal ------ * Bug fixes: + Add missing semicolons to code generated in some cases for a function which always succeeds or always fails. The new dutch.sbl was triggering this bug. + If the end of a routine was unreachable code the Snowball compiler would think the start of the next routine was also unreachable and would not generate it. This didn't affect code generation for any algorithms we currently ship. * Code quality: + Eliminate commented out code generated for string `$`. None of the shipped stemmers use string `$`, though the Schinke Latin stemmer algorithm on the website does. * Other changes: + Enable warnings, etc from fpc. + Select GNU-style diagnostic format. Python ------ * Optimisations: + Use Python set for grouping checks. This speeds up running the Python testsuite by about 4%. + Routines used in `among` are now referenced by name directly in the generated code, rather than using a string containing the name. This avoids a `getattr()` call each time an among wants to call a routine. This doesn't seem to make a measurable speed difference, but it's cleaner and avoids problems with name mangling. Suggested by David Corbett in #217. 
+ Simplify code generated for `loop`. If the iteration count is constant and at most 4 then iterate over a tuple which microbenchmarking shows is faster. The only current uses of loop in the shipped stemmers are `loop 2` so benefit from this. Otherwise we now use `range(AE)` instead of `range (AE, 0, -1)` (the actual value of the loop variable is never used so only the number of iterations matters). * Bug fixes: + Correctly handle stemmer names with an underscore. * Code quality: + Generate Python with UTF-8 source encoding. This makes the generated code easier to follow, which helps during development. It's also a bit smaller. For now codepoints U+0590 and above are still emitted as escape sequences to avoid confusing source code rendering when RTL scripts are involved. * Other changes: + Set python_requires to indicate to install tools that the generated code won't work with Python 3.0.x, 3.1.x and 3.2.x (due to use of `u"foo"` string literals). Closes #192 and #191, opened by Andreas Maier. + Add classifiers to indicate support for Python 3.3 and for 3.8 to 3.13. Fixes #158, reported by Dmitry Shachnev. + Stop marking the wheel as universal, which had started to give a warning message. Patch from Dmitry Shachnev (#210). + Stop calling `setup.py` directly which is deprecated and now produces a warning - use the `build` module instead. Patch from Dmitry Shachnev (#210). Rust ---- * Optimisations: + Shortcut unnecessary calls to find_among, porting an optimization from the C generator. In some stemming benchmarks this improves the performance of the rust english stemmer by about 27%. Patch from jedav (#202). * Code quality: + Suppress unused_parens warning, for example triggered by the code generated for `$x = x*x` (where `x` is an integer). + Dispatch `among` result with `match` instead of an `if` ... `else if` chain (which looks like we did because the Rust generator evolved from the Python generator and Python didn't used to have a switch-like construct). 
This results in a 3% speed-up for an unoptimised Rust compile but doesn't seem to make a measurable difference when optimising so it seems the Rust compiler is optimising both to equivalent code. However using a `match` here seems clearer, a better match for the intent, and is a bit simpler to generate. + Generate Rust with UTF-8 source encoding. This makes the generated code easier to follow, which helps during development. It's also a bit smaller. For now codepoints U+0590 and above are still emitted as escape sequences to avoid confusing source code rendering when RTL scripts are involved. New stemming algorithms ----------------------- * Add Esperanto stemmer from David Corbett (#185). * Add Estonian algorithm from Linda Freienthal (#108). Behavioural changes to existing algorithms ------------------------------------------ * Dutch: Switch to Kraaij-Pohlmann as the default for Dutch. In case you want Martin Porter's Dutch stemming algorithm for compatibility, this is now available as `dutch_porter`. Fixes #1, reported by gboer. * Dutch (Kraaij-Pohlmann): Fix differences between the Snowball implementation and the original C implementation. * Dutch (Kraaij-Pohlmann): Add a small number of exceptions to the Snowball implementation to avoid unwanted conflations. This addresses all cases so far identified which Martin's Dutch stemmer handled better. Fixes #208. * Dutch (Porter): The "at least 3 characters" part of the R1 definition was actually implemented such that when working in UTF-8 it was "at least 3 bytes". We stripped accents normally found in Dutch except for `è` before setting R1, and no Dutch words starting `è` seem to stem differently depending on encoding, but proper nouns and other words of foreign origin may contain other accented characters and it seems better for the stemmer to handle such words the same way regardless of the encoding in use. * English: Replace '-ogist' with '-og' to conflate "geologist" and "geology", etc. 
Suggested by Marc Schipperheijn on snowball-discuss. * English: Add extra condition to undoubling. We no longer undouble if the double consonant is preceded by exactly "a", "e" or "o" to avoid conflating "add"/"ad", "egg"/"eg", "off"/"of", etc. Fixes #182, reported by Ed Page. * English: Avoid conflating 'emerge' and 'emergency'. Reported by Frederick Ross on snowball-discuss. * English: Avoid conflating 'evening' and 'even'. Reported by Ann B on snowball-discuss. * English: Avoid conflating 'lateral' and 'later'. Reported by Steve Tolkin on snowball-discuss. * English: Avoid conflating 'organ', 'organic' and 'organize'. * English: Avoid conflating 'past' and 'paste'. Reported by Sonny on snowball-discuss. * English: Avoid conflating 'universe', 'universal' and 'university'. Reported by Clem Wang on snowball-discuss. * English: Handle -eed and -ing exceptions in their respective rules. This avoids the overhead of checking for them for the majority of words which don't end -eed or -ing. It also allows us to easily handle vying->vie and hying->hie at basically no extra cost. Reduces the time to stem all words in our English word list by nearly 2%. * French: Remove elisions as first step. See #187. Originally reported by Paul Rudin and kelson42. * French: Remove -aise and -aises so for example, "française" and "françaises" are now conflated with "français". Fixes #209. Originally reported by ririsoft and Fred Fung. * French: Avoid incorrect conflation of `mauvais` (bad) with `mauve` (mauve, mallow or seagull); avoid conflating `mal` with `malais`, `pal` with `palais`, etc. * French: Avoid conflating `ni` (neither/nor) with `niais` (inexperienced/silly) and `nie`/`nié`/`nier`/`nierais`/`nierons` (to deny). * French: -oux -> -ou. Fixes #91, reported by merwok. * German: Replace with the "german2" variant. 
This normalises umlauts ("ä" to "ae", "ö" to "oe", "ü" to "ue") which is presumably much less common in newly created text than it once was as modern computer systems generally don't have the limitations which motivated this, but there will still be large amounts of legacy text which it seems helpful for the stemmer to handle without having to know to select a variant. On our sample German vocabulary which contains 35033 words, 77 words give different stems. A significant proportion of these are foreign words, and some are proper nouns. Some cases definitely seem improved, and quite a few are just different but effectively just change the stem for a word or group of words to a stem that isn't otherwise generated. There don't seem any changes that are clearly worse, though there are some changes that have both good and bad aspects to them. Fixes #92, reported by jrabensc. * German: Don't remove -em if preceded by -syst to avoid overstemming words ending -system. This change means we now conflate e.g. "system" and "systemen". Partly addresses #161, reported by Olga Gusenikova. * German: Remove -erin and -erinnen suffixes which conflates singular and plural female versions of nouns with the male versions. Fixes #85 and partly addresses #161, reported by Olga Gusenikova. * German: Replace -ln and -lns with -l. This improves 82 cases in the current sample data without making anything worse. Tests on a larger word list look good too. Partly addresses #161, reported by Olga Gusenikova. * German: Remove -et suffix when we safely can. Fixes #200, reported by Robert Frunzke. * Greek: Fix "faulty slice operation" for input `ισαισα`. The fix changes `ισα` to stem to `ισ` instead of the empty string, which seems better (and to be what the second paper actually says to do if read carefully). Fixes #204, reported by subnix. * Italian: Address overstemming of "divano" (sofa) which previously stemmed to "div", which is the stem for 'diva' (diva). 
Now it is stemmed to 'divan', which is what its plural form 'divani' already stemmed to. Fixes #49, reported by francesco. * Norwegian: Improve stemming of words ending -ers. Fixes #175, reported by Karianne Berg. * Norwegian: Include more accented vowels - treating "ê", "ò", "ó" and "ô" as vowels improves the stemming of a fairly small number of words, but there's basically no cost to having extra vowels in the grouping, and some of these words are commonly used. Fixes #218, reported by András Jankovics. * Romanian: Fix to work with Romanian text encoded using the correct Unicode characters. Romanian uses a "comma below" diacritic on letters "s" and "t" ("ș" and "ț"). Before Unicode these weren't easily available so Romanian text was written using the visually similar "cedilla" diacritic on these letters instead ("ş" and "ţ"). Previously our stemmer only recognised the latter. Now it maps the cedilla forms to "comma below" as a first step. Patch from Robert Muir. * Spanish: Handle -acion like -ación and -ucion like -ución. It's apparently common to miss off accents in Spanish, and there are examples in our test vocabulary that these changes help. Proposed by Damian Janowski. * Swedish: Replace suffix "öst" with "ös" when preceded by any of 'iklnprtuv' rather than just 'l'. The new rule only requires the "öst" to be in R1 whereas previously we required all of "löst" to be. This second tweak doesn't seem to affect any words ending "löst" but it conflates a few extra cases when combined with the expanded list of preceding letters, and seems more logical linguistically (since "ös" is akin to "ous" in English). Fixes #152, reported by znakeeye. * Swedish: Remove -et/-ets in cases where it helps. Removing -et can't be done unconditionally because many words end in -et where this isn't a suffix. However it's a very common suffix so it seems worth crafting a more complex condition under which to remove. Fixes #47. * Turkish: Remove proper noun suffixes. 
For example, `Türkiye'dir` ("it is Turkey") is now conflated with `Türkiye` ("Turkey"). Fixes #188. * Yiddish: Avoid generating empty stem for input "גע" (not a valid word, but it's better to avoid an empty stem for any non-empty input). Optimisations to existing algorithms ------------------------------------ * General change: Use `gopast` everywhere to establish R1 and R2 as it is a little more efficient to do so. * Basque: Use an empty action rather than replacing the suffix with itself which seems clearer and is a little more efficient. * Dutch (Porter): Optimise prelude routine. * English: Remove unnecessary exception for `skis` as the algorithm stems `skis` to `ski` by itself (`skies` and `sky` do still need a special case to avoid conflation with `ski` though). * Hungarian: We no longer take digraphs into account when determining where R1 starts. This can only make a difference to the stemming if we removed a suffix that started with the last character of the digraph (or with "zs" in the case of "dzs"), and that doesn't happen for any of the suffixes we remove for any valid Hungarian words. This simplification speeds up stemming by ~2% on the current sample vocabulary list. See #216. Thanks to András Jankovics for confirming no Hungarian words are affected by this change. * Lithuanian: Remove redundant R1 check. * Nepali: Eliminate redundant check_category_2 routine. * Tamil: Optimise by using `among` instead of long `or` chains. The generated C version now takes 43% less time to process the test vocabulary. * Tamil: Remove many cases which can't be triggered due to being handled by another case. * Tamil: Clean up some uses of `test`. * Tamil: Make `fix_va_start` simpler and faster. * Tamil: Localise use of `found_a_match` flag. * Tamil: Eliminate pointless flag changes. * Turkish: Minor optimisations. 
Code clarity improvements to existing algorithms ------------------------------------------------ * Stop noting dates changes were made in comments in the code - we now maintain a changelog in each algorithm's description page on the website (and the version control history provides a finer grained view). * Always use `insert` instead of `<+` as the named command seems clearer. * English: Add comments documenting motivating examples for all exceptional cases. * Lithuanian: Change to recommended latin stringdef codes. Using common codes makes it easier to work across algorithms, but they are more mnemonic so also seem clearer when just considering this one algorithm. * Serbian: Change to recommended latin stringdef codes. Using common codes makes it easier to work across algorithms, but they are more mnemonic so also seem clearer when just considering this one algorithm. * Turkish: Use `{sc}` for s-cedilla and `{i}` for dotless-i to match other uses. Compiler -------- * Generic code generation improvements: + Show Snowball source leafname in "generated" comment at start of files. + Add generic reachability tracking machinery. This facilitates various new optimisations, so far the following have been implemented: - Tail-calling - Simpler code for calling routines which always give the same signal - Simpler code when a routine ends in an integer test (this also allows eliminating an Ada-specific codegen optimisation which did something similar but only for routines which consisted *entirely* of a single integer test). - Dead code reporting and removal (only in simple cases currently) Currently this overlaps in functionality with the existing reachability tracking which is implemented on a per-language basis, and only for some languages. This reachability tracking was originally added for Java where some unreachable code is invalid and results in a compile time error, but then seems to have been copied for some other newer languages which may or may not actually need it. 
The approach it uses unfortunately relies on correctly updating the reachability flag anywhere in the generator code where reachability can change which has proved to be a source of bugs, some unfixed. This new approach seems better and with some more work should allow us to eliminate the older code. Fixes #83. + Omit check for `among` failing in generated code when we can tell at compile time that it can't fail. + Optimise `goto`/`gopast` applied to a grouping or inverted grouping (which is by far the most common way to use `goto`/`gopast`) for all target languages (new for Go, Java, Javascript, Pascal and Rust). + We never need to restore the cursor after `not`. If `not` turns signal `f` into `t` then it sets `c` back to its old position; otherwise, `not` signals `f` and `c` will get reset by whatever ultimately handles this `f` (or the program exits and the position of `c` no longer matters). This slightly improves the generated code for the `english` and `porter` stemmers. + Don't generate code for undefined or unused routines. + Avoid generating variable names and then not actually using them. This eliminates mysterious gaps in the numbering of variables in the generated code. + Eliminate `!`/`not` from integer test code by generating the inverse comparison operator instead for all languages, e.g. for Python we now generate if self.I_p1 >= self.I_x: instead of if not self.I_p1 < self.I_x: This isn't going to be faster in compiled languages with an optimiser but for scripting languages it may be faster, and even if not, it makes for a little less work when loading the script. + Canonicalise `hop 1` to `next` as the generated code for `next` can be slightly more efficient. This will also apply to `hop` followed by a constant expression which Snowball can reduce to `1`. + Avoid trailing whitespace in generated files. + Fix problems with --comments option: - When generating C code we would segfault for code containing `atleast`, `hop` or integer tests. 
- Fix missing comments for some commands in some target languages. - Fix inconsistent formatting of comments in some target languages. - Comments in C are now always on their own line - previously some were at the end of the line and some on their own line which made them harder to follow. - Emit comments before `among` and before routine/external definitions. + Simplify more cases of numeric expressions (e.g. `x * 1` to `x`). * Improve --help output. * Division by zero during constant folding now gives an error. * For `hop` followed by an unexpected token (e.g. `hop hop`) we were already emitting a suitable error but would then segfault. * Emit error for redefinition of a grouping. * Improve errors for `define` of an undeclared name. We already peek at the next token to decide whether to try to parse as a routine or grouping. Previously we parsed as a routine if it was `as`, and a grouping otherwise, but routine definitions are more common and a grouping can only start with a literal string or a name, so now we assume a routine definition with a missing `as` if the next token isn't valid for either. * Suppress duplicate (or even triplicate) "unexpected" errors for the same token when the compiler tried to recover from the error by adjusting the parse state and marking the token to be reparsed, but the same token then failed to parse in the new state. * Fix NULL pointer dereference if an undefined grouping is used in the definition of another grouping. * Fix mangled error for `set` or `unset` on a non-boolean: test.sbl:2: nameInvalid type 98 in name_of_type() * Emit warning if `=>` is used. The documentation of how it works doesn't match the implementation, and it seems it has only ever been used in the Schinke stemmer implementation (which assumes the implemented behaviour). We've updated the Schinke implementation to avoid it. If you're using it in your own Snowball code please let us know. * Improve errors for unterminated string literals. 
* Fix NULL pointer dereference on invalid code such as `$x = $y`. * If malloc fails while compiling the compiler will now report the failure and exit. Previously the NULL return from malloc wasn't checked for so we'd typically segfault. * `lenof` and `sizeof` applied to a string variable now mark the variable as used, which avoids a bogus error followed by a confusing additional message if this is the only use of that variable: lenofsizeofbug.sbl:3: warning: string 's' is set but never used Unhandled type of dead assignment via sizeof This situation is unlikely to occur in real world code. * The reported line number for "string not terminated" error was one too high in the case where we were in a stringdef (but correct if we weren't). * Eliminate special handling for among starter. We now convert the starter to be a command before the among, adding an explicit substring if there isn't one. * We now warn if the body of a `repeat` or `atleast` loop always signals `t` (meaning it will loop forever which is very undesirable for a stemming algorithm) or always signals `f` (meaning it will never loop, which seems unlikely to be what was intended). * Release memory in compiler before exit. The OS will free all allocated memory when a process exits, so this memory isn't actually leaked, but it can be annoying when using snowball as part of a larger build process with some leak-finding tools. Patch from jsteemann in #166. * Store textual data more efficiently in memory during Snowball compilation. Previously almost all textual data was stored as 16 bit values, but most such data only uses 8 bit character values. Doubling the memory usage isn't really an issue as Snowball programs are tiny, but this also complicated code handling such data. Now only literal strings use the 16 bit values. * Fix clang -Wunused-but-set-variable warning in compiler code. * Fix a few -Wshadow warnings in compiler and enable this warning by default. 
* Tighten parsing of `writef()` format strings. We now error out on unrecognised escape codes or if a numbered escape is used with too high a number or a non-digit. This change reveals that the Go and Rust generators were using invalid escape ~A - the old writef() code was substituting this with just A which is what is wanted so this case was harmless but being lenient here could hide bugs, especially when copying code between generators as they don't all support the same set of format codes. Build system ------------ * Turn on Java warnings and make them errors. * Compile C code with -g by default. This makes debugging easier, and matches the default for at least some other build systems (e.g. autotools). * Fix "make clean" to remove all built Ada files. * Clean `stemtest` too. Patch from Stefano Rivera. * Add missing `COMMON_FILES` dependency to dist targets. * GNUmakefile: Tidy up and make more consistent * GNUmakefile: Make use of $* to improve speed and readability. * Use $(patsubst ...) instead of sed in .java.class rule which gives cleaner make output and is a bit more efficient. * Add `WERROR` make variable to provide a way to add `-Werror` to existing CFLAGS. libstemmer ---------- Testsuite --------- * Give a clear error if snowball-data isn't found. Fixes #196, reported by Andrea Maccis. * Handle not thinning testdata better. If THIN_FACTOR is set to 1 we no longer run gzipped test data through awk. We also now handle THIN_FACTOR being set empty as equivalent to 1 for convenience. * csharp_stemwords: Correctly handle a stemmer name containing an underscore. * csharp_stemwords: Make `-i` option optional and read from stdin if omitted, like the C version does. * csharp_stemwords: Process the input line by line which is more helpful for interactive testing, and also a little faster. * Fix Java TestApp to allow a single argument. 
The documented command line syntax is that you only need to specify the language and there was already code to read from stdin if no input file was specified, but at least two command line options were required. * Fix deprecation warning in TestApp.java. * Optimise TestApp.java by creating fewer objects. Patch from Robert Muir. * stemwords.py: We no longer create an empty output file if we fail to open the input file. * stemwords: Improve error message to say "Out of memory or internal error" rather than just "Out of memory". Documentation ------------- * Include "what is stemming" section in each README. * Include section on threads in each README. Based on patch for Python from dbcerigo. * Document that input should be lowercase with composed accents. See #186, reported by 1993fpale. * Add README section on building, including notes on cross-compiling. Fixes #205, reported by sin-ack. * CONTRIBUTING.rst: Clarify which charsets to list * CONTRIBUTING.rst: Add general advice section. In particular, note to use spaces-only for indentation in most cases. Thanks to Dmitry Shachnev for raising this point. * CONTRIBUTING.rst: Note that UTF-8 is OK in comments. Thanks to Dmitry Shachnev for asking. * Fix some typos. Patch from Josh Soref. * Document that our CI now uses github actions. * Update link to Greek stemmer PDF. Patch from Michael Bissett (#33). Snowball 2.2.0 (2021-11-10) =========================== New Code Generators ------------------- * Add Ada generator from Stephane Carrez (#135). Javascript ---------- * Fix generated code to use integer division rather than floating point division. Noted by David Corbett. Pascal ------ * Fix code generated for division. Previously real division was used and the generated code would fail to compile with an "Incompatible types" error. Noted by David Corbett. * Fix code generated for Snowball's `minint` and `maxint` constant. 
Python ------ * Python 2 is no longer actively supported, as proposed on the mailing list: https://lists.tartarus.org/pipermail/snowball-discuss/2021-August/001721.html * Fix code generated for division. Previously the Python code we generated used integer division but rounded negative fractions towards negative infinity rather than zero under Python 2, and under Python 3 used floating point division. Noted by David Corbett. Code quality Improvements ------------------------- * C/C++: Generate INT_MIN and INT_MAX directly, including from the generated C file if necessary, and remove the MAXINT and MININT macros from runtime/header.h. * C#: An `among` without functions is now generated as `static` and groupings are now generated as constant. Patches from James Turner in #146 and #147. Code generation improvements ---------------------------- * General: + Constant numeric subexpressions and constant numeric tests are now evaluated at Snowball compile time. + Simplify the following degenerate `loop` and `atleast` constructs where N is a compile-time constant: - loop N C where N <= 0 is a no-op. - loop N C where N == 1 is just C. - atleast N C where N <= 0 is just repeat C. If the value of N doesn't depend on the current target language, platform or Unicode settings then we also issue a warning. Behavioural changes to existing algorithms ------------------------------------------ * german2: Fix handling of `qu` to match algorithm description. Previously the implementation erroneously did `skip 2` after `qu`. We suspect this was intended to skip the `qu` but that's already been done by the substring/among matching, so it actually skips an extra two characters. The implementation has always differed in this way, but there's no good reason to skip two extra characters here so overall it seems best to change the code to match the description. 
This change only affects the stemming of a single word in the sample vocabulary - `quae` which seems to actually be Latin rather than German. Optimisations to existing algorithms ------------------------------------ * arabic: Handle exception cases in the among they're exceptions to. * greek: Remove unused slice setting, handle exception cases in the among they're exceptions to, and turn `substring ... among ... or substring ... among ...` into a single `substring ... among ...` in cases where it is trivial to do so. * hindi: Eliminate the need for variable `p`. * irish: Minor optimisation in setting `pV` and `p1`. * yiddish: Make use of `among` more. Compiler -------- * Fix handling of `len` and `lenof` being declared as names. For compatibility with programs written for older Snowball versions len and lenof stop being tokens if declared as names. However this code didn't work correctly if the tokeniser's name buffer needed to be enlarged to hold the token name (i.e. 3 or 5 elements respectively). * Report a clearer error if `=` is used instead of `==` in an integer test. * Replace a single entry command list with its contents in the internal syntax tree. This puts things in a more canonical form, which helps subsequent optimisations. Build system ------------ * Support building on Microsoft Windows (using mingw+msys or a similar Unix-like environment). Patch from Jannick in #129. * Split out INCLUDES from CPPFLAGS so that CPPFLAGS can now be overridden by the user if required. Fixes #148, reported by Dominique Leuenberger. * Regenerate algorithms.mk only when needed rather than on every `make` run. libstemmer ---------- * The libstemmer static library now has a `.a` extension, rather than `.o`. Patch from Michal Vasilek in #150. Testsuite --------- * stemtest: Test that numbers and numeric codes aren't damaged by any of the algorithms. Regression test for #66. Fixes #81. * ada: Fix ada tests to fail if output differs. 
There was an extra `| head -300` compared to other languages, which meant that the exit code of `diff` was ignored. It seems more helpful (and is more consistent) not to limit how many differences are shown so just drop this addition. * go: Stop thinning testdata. It looks like we only are because the test harness code was based on that for rust, which was based on that for javascript, which was only thinning because it was reading everything into memory and the larger vocabulary lists were resulting in out of memory issues. * javascript: Speed up stemwords.js. Process input line-by-line rather than reading the whole file into memory, splitting, iterating, and creating an array with all the output, joining and writing out a single huge string. This also means we can stop thinning the test data for javascript, which we were only doing because the huge arabic test data file was causing out of memory errors. Also drop the -p option, which isn't useful here and complicates the code. * rust: Turn on optimisation in the makefile rather than the CI config. This makes the tests run in about 1/5 of the time and there's really no reason to be thinning the testdata for rust. Documentation ------------- * CONTRIBUTING.rst: Improve documentation for adding a new stemming algorithm. * Improve wording of Python docs. Snowball 2.1.0 (2021-01-21) =========================== C/C++ ----- * Fix decoding of 4-byte UTF-8 sequences in `grouping` checks. This bug affected Unicode codepoints U+40000 to U+7FFFF and U+C0000 to U+FFFFF and doesn't affect any of the stemming algorithms we currently ship (#138, reported by Stephane Carrez). Python ------ * Fix snowballstemmer.algorithms() method (#132, reported by kkaiser). * Update code to generate trove language classifiers for PyPI. All the natural languages we previously had stemmers for have now been added to PyPI's list, but Armenian and Yiddish aren't on it. Patch from Dmitry Shachnev. 
Code Quality Improvements ------------------------- * Suppress GCC warning in compiler code. * Use `const` pointers more in C runtime. * Only use spaces for indentation in javascript code. Change proposed by Emily Marigold Klassen in #123, and seems to be the modern Javascript norm. New Snowball Language Features ------------------------------ * `lenof` and `sizeof` can now be applied to a literal string, which can be useful if you want to do calculations on cursor values. This change actually simplifies the language a little, since you can now use a literal string in any read-only context which accepts a string variable. Code generation improvements ---------------------------- * General: + Fix bugs in the code generated to handle failure of `goto`, `gopast` or `try` inside `setlimit` or string-`$`. This affected all languages (though the issue with `try` wasn't present for C). These bugs don't affect any of the stemming algorithms we currently ship. Reported by Stefan Petkovic on snowball-discuss. + Change `hop` with a negative argument to work as documented. The manual says a negative argument to hop will raise signal f, but the implementation for all languages was actually to move the cursor in the opposite direction to `hop` with a positive argument. The implemented behaviour is problematic as it allows invalidating implicitly saved cursor values by modifying the string outside the current region, so we've decided it's best to fix the implementation to match the documentation. The only Snowball code we're aware of which relies on this was the original version of the new Yiddish stemming algorithm, which has been updated not to rely on this. The compiler now issues a warning for `hop` with a constant negative argument (internally now converted to `false`), and for `hop` with a constant zero argument (internally now converted to `true`). 
+ Canonicalise `among` actions equivalent to `()` such as `(true)` which previously resulted in an extra case in the among, and for Python we'd generate invalid Python code (`if` or `elif` with an empty body). Bug revealed by Assaf Urieli's Yiddish stemmer in #137. + Eliminate variables whose values are never used - they no longer have corresponding member variables, etc, and no code is generated for any assignments to them. + Don't generate anything for an unused `grouping`. + Stop warning "grouping X defined but not used" for a `grouping` which is only used to define another `grouping`. * C/C++: + Store booleans in same array as integers. This means each boolean is stored as an int instead of an unsigned char which means 4 bytes instead of 1, but we save a pointer (4 or 8 bytes) in struct SN_env which is a win for all the current stemmers. For an algorithm which uses both integers and booleans, we also save the overhead of allocating a block on the heap, and potentially improve data locality. + Eliminate duplicate generated C comment for sliceto. * Pascal: + Avoid generating unused variables. The Pascal code generated for the stemmers we ship is now warning free (tested with fpc 3.2.0). + Don't emit empty `private` sections. Cosmetic, but makes the generated code a bit easier to follow. * Python: + End `if`-chain with `else` where possible, avoiding a redundant test of the variable being switched on. This optimisation kicks in for an `among` where all cases have commands. This change seems to speed up `make check_python_arabic` by a few percent. New stemming algorithms ----------------------- * Add Serbian stemmer from stef4np (#113). * Add Yiddish stemmer from Assaf Urieli (#137). * Add Armenian stemmer from Astghik Mkrtchyan. It's been on the website for over a decade, and included in Xapian for over 9 years without any negative feedback. 
Optimisations to existing algorithms ------------------------------------ * kraaij_pohlmann: Use `$v = limit` instead of `do (tolimit setmark v)` since this generates simpler code, and also matches the code other algorithm implementations use. Probably for languages like C with optimising compilers the compiler will generate equivalent code anyway, but e.g. for Python this should be an improvement. Code clarity improvements to existing algorithms ------------------------------------------------ * hindi.sbl: Fix comment typo. Compiler -------- * Don't count `$x = x + 1` as initialising or using `x`, so it's now handled like `$x += 1` already is. * Comments are now only included in the generated code if command line option -comments is specified. The comments in the generated code are useful if you're trying to debug the compiler, and perhaps also if you are trying to debug your Snowball code, but for everyone else they just bloat the code which as the number of languages we support grows becomes more of an issue. * `-parentclassname` is not only for java and csharp so don't disable it if those backends are disabled. * `-syntax` now reports the value for each numeric literal. * Report location for excessive get nesting error. * Internally the compiler now represents negated literal numbers as a simple `c_number` rather than `c_neg` applied to a `c_number` with a positive value. This simplifies optimisations that want to check for a constant numeric expression. Build system ------------ * Link binaries with LDFLAGS if it's set, which is needed for some platform (e.g. OpenEmbedded). Patch from Andreas Müller (#120). * Add missing dependencies of algorithms.go rule. Testsuite --------- * C: Add stemtest for low-level regression tests. Documentation ------------- * Document a C99 compiler as a requirement for building the snowball compiler (but the C code it generates should still work with any ISO C compiler). 
A few declarations mixed with code crept in some time ago (which nobody's complained about), so this is really just formally documenting a requirement which already existed. * README: Explain what Snowball is and what Stemming is (#131, reported by Sean Kelly). * CONTRIBUTING.rst: Expand section on adding a new generator. * For Python snowballstemmer module include global NEWS instead of Python-specific CHANGES.rst and use README.rst as the long description. Patch from Dmitry Shachnev (#119). * COPYING: Update and incorporate Python backend licensing information which was previously in a separate file. Snowball 2.0.0 (2019-10-02) =========================== C/C++ ----- * Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled sequences of any length, but commands which look at the character value only handled sequences up to length 3. Fixes #89. * Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`. Java ---- * TestApp.java: - Always use UTF-8 for I/O. Patch from David Corbett (#80). - Allow reading input from stdin. - Remove rather pointless "stem n times" feature. - Only lower case ASCII to match stemwords.c. - Stem empty lines too to match stemwords.c. Code Quality Improvements ------------------------- * Fix various warnings from newer compilers. * Improve use of `const`. * Share common functions between compiler backends rather than having multiple copies of the same code. * Assorted code clean-up. * Initialise line_labelled member of struct generator to 0. Previously we were invoking undefined behaviour, though in practice it'll be zero initialised on most platforms. New Code Generators ------------------- * Add Python generator (#24). Originally written by Yoshiki Shibukawa, with additional updates by Dmitry Shachnev. * Add Javascript generator. Based on JSX generator (#26) written by Yoshiki Shibukawa. * Add Rust generator from Jakob Demler (#51). * Add Go generator from Marty Schoch (#57). * Add C# generator. 
Based on patch from Cesar Souza (#16, #17). * Add Pascal generator. Based on Delphi backend from stemming.zip file on old website (#75). New Snowball Language Features ------------------------------ * Add `len` and `lenof` to measure Unicode length. These are similar to `size` and `sizeof` (respectively), but `size` and `sizeof` return the length in bytes under `-utf8`, whereas these new commands give the same result whether using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in the length of the string). For compatibility with existing code which might use these as variable or function names, they stop being treated as tokens if declared to be a variable or function. * New `{U+1234}` stringdef notation for Unicode codepoints. * More versatile integer tests. Now you can compare any two arithmetic expressions with a relational operator in parentheses after the `$`, so for example `$(len > 3)` can now be used when previously a temporary variable was required: `$tmp = len $tmp > 3` Code generation improvements ---------------------------- * General: + Avoid unnecessarily saving and restoring of the cursor for more commands - `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always restore its value, and for C `booltest` (which other languages already handled). + Special case handling for `setlimit tomark AE`. All uses of setlimit in the current stemmers we ship follow this pattern, and by special-casing we can avoid having to save and restore the cursor (#74). + Merge duplicate actions in the same `among`. This reduces the size of the switch/if-chain in the generated code which dispatch the among for many of the stemmers. + Generate simpler code for `among`. We always check for a zero return value when we call the among, so there's no point also checking for that in the switch/if-chain. We can also avoid the switch/if-chain entirely when there's only one possible outcome (besides the zero return). + Optimise code generated for `do `. 
This speeds up "make check_python" by about 2%, and should speed up other interpreted languages too (#110). + Generate more and better comments referencing snowball source. + Add homepage URL and compiler version as comments in generated files. * C/C++: + Fix `size` and `sizeof` to not report one too high (reported by Assem Chelli in #32). + If signal `f` from a function call would lead to return from the current function then handle this and bailing out on an error together with a simple `if (ret <= 0) return ret;` + Inline testing for single character literals. + Avoid generating `|| 0` in corner case - this can result in a compiler warning when building the generated code. + Implement `insert_v()` in terms of `insert_s()`. + Add conditional `extern "C"` so `runtime/api.h` can be included from C++ code. Closes #90, reported by vvarma. * Java: + Fix functions in `among` to work in Java. We seem to need to make the methods called from among `public` instead of `private`, and to call them on `this` instead of the `methodObject` (which is cleaner anyway). No revision in version control seems to generate working code for this case, but Richard says it definitely used to work - possibly older JVMs failed to correctly enforce the access controls when methods were invoked by reflection. + Code after handling `f` by returning from the current function is unreachable too. + Previously we incorrectly decided that code after an `or` was unreachable in certain cases. None of the current stemmers in the distribution triggered this, but Martin Porter's snowball version of the Schinke Latin stemmer does. Fixes #58, reported by Alexander Myltsev. + The reachability logic was failing to consider reachability from the final command in an `or`. Fixes #82, reported by David Corbett. + Fix `maxint` and `minint`. Patch from David Corbett in #31. + Fix `$` on strings. The previous generated code was just wrong. 
This doesn't affect any of the included algorithms, but for example breaks Martin Porter's snowball implementation of Schinke's Latin Stemmer. Issue noted by Jakob Demler while working on the Rust backend in #51, and reported in the Schinke's Latin Stemmer by Alexander Myltsev in #58. + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43. + Eliminate range-check implementation for groupings. This was removed from the C generator 10 years earlier, isn't used for any of the existing algorithms, and it doesn't seem likely it would be - the grouping would have to consist entirely of a contiguous block of Unicode code-points. + Simplify code generated for `repeat` and `atleast`. + Eliminate unused return values and variables from runtime functions. + Only import the `among` and `SnowballProgram` classes if they're actually used. + Only generate `copy_from()` method if it's used. + Merge runtime functions `eq_s` and `eq_v` functions. + Java arrays know their own length so stop storing it separately. + Escape char 127 (DEL) in generated Java code. It's unlikely that this character would actually be used in a real stemmer, so this was more of a theoretical bug. + Drop unused import of InvocationTargetException from SnowballStemmer. Reported by GerritDeMeulder in #72. + Fix lint check issues in generated Java code. The stemmer classes are only referenced in the example app via reflection, so add @SuppressWarnings("unused") for them. The stemmer classes override equals() and hashCode() methods from the standard java Object class, so mark these with @Override. Both suggested by GerritDeMeulder in #72. + Declare Java variables at point of use in generated code. Putting all declarations at the top of the function was adding unnecessary complexity to the Java generator code for no benefit. + Improve formatting of generated code. New stemming algorithms ----------------------- * Add Tamil stemmer from Damodharan Rajalingam (#2, #3). 
* Add Arabic stemmer from Assem Chelli (#32, #50). * Add Irish stemmer from Jim O'Regan (#48). * Add Nepali stemmer from Arthur Zakirov (#70). * Add Indonesian stemmer from Olly Betts (#71). * Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review. * Add Lithuanian stemmer from Dainius Jocas (#22, #76). * Add Greek stemmer from Oleg Smirnov (#44). * Add Catalan and Basque stemmers from Israel Olalla (#104). Behavioural changes to existing algorithms ------------------------------------------ * Portuguese: + Replace incorrect Spanish suffixes by Portuguese suffixes (#1). * French: + The MSDOS CP850 version of the French algorithm was missing changes present in the ISO8859-1 and Unicode versions. There's now a single version of each algorithm which was based on the Unicode version. + Recognize French suffixes even when they begin with diaereses. Patch from David Corbett in #78. * Russian: + We now normalise 'ё' to 'е' before stemming. The documentation has long said "we assume ['ё'] is mapped into ['е']" but it's more convenient for the stemmer to actually perform this normalisation. This change has no effect if the caller is already normalising as we recommend. It's a change in behaviour if they aren't, but 'ё' occurs rarely (there are currently no instances in our test vocabulary) and this improves behaviour when it does occur. Patch from Eugene Mirotin (#65, #68). * Finnish: + Adjust the Finnish algorithm not to mangle numbers. This change also means it tends to leave foreign words alone. Fixes #66. * Danish: + Adjust Danish algorithm not to mangle alphanumeric codes. In particular alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000, space1999) are no longer mangled. See #81. Optimisations to existing algorithms ------------------------------------ * Turkish: + Simplify uses of `test` in stemmer code. + Check for 'ad' or 'soyad' more efficiently, and without needing the strlen variable. 
This speeds up "make check_utf8_turkish" by 11% on x86 Linux. * Kraaij-Pohlmann: + Eliminate variable x `$p1 <= cursor` is simpler and a little more efficient than `setmark x $x >= p1`. Code clarity improvements to existing algorithms ------------------------------------------------ * Turkish: + Use , for cedilla to match the conventions used in other stemmers. * Kraaij-Pohlmann: + Avoid cryptic `[among ( (])` ... `)` construct - instead use the same `[substring] among (` ... `)` construct we do in other stemmers. Compiler -------- * Support conventional --help and --version options. * Warn if -r or -ep used with backend other than C/C++. * Warn if encoding command line options are specified when generating code in a language with a fixed encoding. * The default classname is now set based on the output filename, so `-n` is now often no longer needed. Fixes #64. * Avoid potential one byte buffer over-read when parsing snowball code. * Avoid comparing with uninitialised array element during compilation. * Improve `-syntax` output for `setlimit L for C`. * Optimise away double negation so generators don't have to worry about generating `--` (decrement operator in many languages). Fixes #52, reported by David Corbett. * Improved compiler error and warning messages: - We now report FILE:LINE: before each diagnostic message. - Improve warnings for unused declarations/definitions. - Warn for variables which are used, but either never initialised or never read. - Flag non-ASCII literal strings. This is an error for wide Unicode, but only a warning for single-byte and UTF-8 which work so long as the source encoding matches the encoding used in the generated stemmer code. - Improve error recovery after an undeclared `define`. We now sniff the token after the identifier and if it is `as` we parse as a routine, otherwise we parse as a grouping. Previously we always just assumed it was a routine, which gave a confusing second error if it was a grouping. 
- Improve error recovery after an unexpected token in `among`. Previously we acted as if the unexpected token closed the `among` (this probably wasn't intended but just a missing `break;` in a switch statement). Now we issue an error and try the next token. * Report error instead of silently truncating character values (e.g. `hex 123` previously silently became byte 0x23 which is `#` rather than a g-with-cedilla). * Enlarge the initial input buffer size to 8192 bytes and double each time we hit the end. Snowball programs are typically a few KB in size (with the current largest we ship being the Greek stemmer at 27KB) so the previous approach of starting with a 10 byte input buffer and increasing its size by 50% plus 40 bytes each time it filled was inefficient, needing up to 15 reallocations to load greek.sbl. * Identify variables only used by one `routine`/`external`. This information isn't yet used, but such variables which are also always written to before being read can be emitted as local variables in most target languages. * We now allow multiple source files on command line, and allow them to be after (or even interspersed) with options to better match modern Unix conventions. Support for multiple source files allows specifying a single byte character set mapping via a source file of `stringdef`. * Avoid infinite recursion in compiler when optimising a recursive snowball function. Recursive functions aren't typical in snowball programs, but the compiler shouldn't crash for any input, especially not a valid one. We now simply limit on how deep the compiler will recurse and make the pessimistic assumption in the unlikely event we hit this limit. Build system ------------ * `make clean` in C libstemmer_c distribution now removes `examples/*.o`. (#59) * Fix all the places which previously had to have a list of stemmers to work dynamically or be generated, so now only modules.txt needs updating to add a new stemmer. 
* Add check_java make target which runs tests for java. * Support gzipped test data (the uncompressed arabic test data is too big for github). * GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball invocations for Java - these are only meaningful when generating C code. * Pass CFLAGS when linking which matches convention (e.g. automake does it) and facilitates use of tools such as ASan. Fixes #84, reported by Thomas Pointhuber. * Add CI builds with -std=c90 to check compiler and generated code are C90 (#54) libstemmer ---------- * Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords. * Add -O2 to CFLAGS. * Make generated tables of encodings and modules const. * Fix clang static analyzer memory leak warning (in practice this code path can never actually be taken). Patch from Patrick O. Perry (#56) Documentation ------------- * Added copyright and licensing details (#10). * Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian and romanian are available in ISO_8859_2. * Remove documentation falsely claiming that libstemmer supports CP850 encoding. * CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and new language backends. * Overhaul libstemmer_python_README. Most notably, replace the benchmark data which was very out of date. snowball-3.1.0/README.rst000066400000000000000000000067561520373054300150430ustar00rootroot00000000000000Snowball is a small string processing language for creating stemming algorithms for use in Information Retrieval, plus a collection of stemming algorithms implemented using it. Snowball was originally designed and built by Martin Porter. Martin retired from development in 2014 and Snowball is now maintained as a community project. Martin originally chose the name Snowball as a tribute to SNOBOL, the excellent string handling language from the 1960s. It now also serves as a metaphor for how the project grows by gathering contributions over time. 
The Snowball compiler translates a Snowball program into source code in another language - currently Ada, ISO C, C#, Dart, Go, Java, Javascript, Object Pascal, PHP, Python, Rust and Zig are supported. This repository contains the source code for the snowball compiler and the stemming algorithms. The snowball compiler is written in ISO C - you'll need a C compiler which supports C99 to build it (but the C code it generates should work with any ISO C compiler). See https://snowballstem.org/ for more information about Snowball. What is Stemming? ================= Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Building Snowball ================= GNU make is required to build Snowball. The build system is currently structured as two separate stages for many of the target languages. The first stage builds the Snowball compiler and runs it to create target language code (and it can also run tests on each stemmer). The expectation is that you then create "distribution" tarballs of this code with ``make dist`` (or create one for a specific target language, e.g. with ``make dist_libstemmer_c`` for C). These tarballs are created in the ``dist/`` subdirectory. 
To actually build the libstemmer library you then unpack and build the distribution tarball, e.g. for C:: tar xf dist/libstemmer_c-3.1.0.tar.gz cd libstemmer_c-3.1.0 make Cross-compiling --------------- If cross-compiling starting from the git repo, the Snowball compiler needs to be built with a native compiler then libstemmer with the cross-compiler. For example:: make CC=cc dist_libstemmer_c tar xf dist/libstemmer_c-3.1.0.tar.gz cd libstemmer_c-3.1.0 make CC=riscv64-unknown-linux-gnu-gcc If you are cross-compiling to or from Microsoft Windows, you'll need to also work around an assumption in libstemmer's ``Makefile`` which sets ``EXEEXT`` based on the OS you are building on:: ifeq ($(OS),Windows_NT) EXEEXT=.exe endif For example, if cross-compiling from Linux to Microsoft Windows, use something like this for the libstemmer build:: make CC=x86_64-w64-mingw32-gcc EXEEXT=.exe When going the other way, you'll need to use ``EXEEXT=``. snowball-3.1.0/ada/000077500000000000000000000000001520373054300140635ustar00rootroot00000000000000snowball-3.1.0/ada/README.md000066400000000000000000000041711520373054300153450ustar00rootroot00000000000000# Ada Target for Snowball The Ada Snowball generator generates an Ada child package for each Snowball algorithm. The parent package is named `Stemmer` and it provides various operations used by the generated code. The `Stemmer` package contains the Ada Snowball runtime available either in `ada/src` directory or from https://github.com/stcarrez/ada-stemmer. The generated child package declares the `Context_Type` tagged type and the `Stem` procedure: ```Ada package Stemmer. is type Context_Type is new Stemmer.Context_Type with private; procedure Stem (Z : in out Context_Type; Result : out Boolean); private type Context_Type is new Stemmer.Context_Type with record ... end record; end Stemmer.; ``` It is possible to use directly the generated operation or use it through the `Stemmer.Factory` package. 
## Usage To generate Ada source for a Snowball algorithm: ``` $ snowball path/to/algorithm.sbl -ada -P -o src/stemmer- ``` ### Ada specific options `-P ` the child package name used in the generated Ada file (defaults to `snowball`). It must be a valid Ada identifier. ## Code Organization `compiler/generator_ada.c` has the Ada code generation logic `ada/src` contains the default Ada Snowball runtime support which is also available at https://github.com/stcarrez/ada-stemmer `ada/algorithms` location where the makefile generated code will end up ## Using the Generated Stemmers To use the generated stemmer, import the Ada generated package, declare an instance of the generated `Context_Type` and call the `Stem_Word` procedure. ``` with Stemmer.English; Ctx : Stemmer.English.Context_Type; Result : Boolean; Ctx.Stem_Word ("zealously", Result); if Result then Ada.Text_IO.Put_Line (Ctx.Get_Result); end if; ``` You can use the context as many times as you want. ## Testing To run the tests, you will need an Ada compiler such as GNAT as well as the `gprbuild` build tool. Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language. 
Run: ``` $ make check_ada ```
snowball-3.1.0/ada/generate.gpr000066400000000000000000000007441520373054300163740ustar00rootroot00000000000000
--  GNAT project file for the `generate` tool (main unit generate.adb, sources
--  in the "generate" directory).  Object/executable directories and all tool
--  packages are taken from the shared Stemmer_Config project.
with "stemmer_config";
project Generate is
   Mains := ("generate.adb");
   for Main use Mains;
   for Source_Dirs use ("generate");
   for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj";
   for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin";
   package Binder renames Stemmer_Config.Binder;
   package Builder renames Stemmer_Config.Builder;
   package Compiler renames Stemmer_Config.Compiler;
   package Linker renames Stemmer_Config.Linker;
end Generate;
snowball-3.1.0/ada/generate/000077500000000000000000000000001520373054300156555ustar00rootroot00000000000000snowball-3.1.0/ada/generate/generate.adb000066400000000000000000000054601520373054300201240ustar00rootroot00000000000000
with Ada.Characters.Handling;
with Ada.Text_IO;
with Ada.Command_Line;
with Ada.Containers.Indefinite_Vectors;

--  Generate the Stemmer.Factory package (stemmer-factory.ads and
--  stemmer-factory.adb, created in the current directory) from the list of
--  language names given on the command line.
--
--  NOTE(review): runs of blanks inside the string literals below appear to
--  have been collapsed to a single blank in this view of the file; they only
--  affect the indentation of the generated text -- confirm against upstream.
procedure Generate is

   use Ada.Characters.Handling;
   use Ada.Text_IO;

   package String_Vectors is
      new Ada.Containers.Indefinite_Vectors (Element_Type => String, Index_Type => Positive);

   --  Language names, lower-cased, in command-line order.
   Languages : String_Vectors.Vector;

   --  Return S with its first character upper-cased.
   --  S must not be empty: S (S'First) is indexed unconditionally.
   function Capitalize (S : in String) return String is
      (To_Upper (S (S'First)) & S (S'First + 1 .. S'Last));

   --  Write the package specification: one enumeration literal
   --  L_<UPPER-CASE NAME> per language and the declaration of Stem.
   procedure Write_Spec is
      File : File_Type;
      I : Natural := 0;  --  Number of literals written so far.
   begin
      Create (File, Out_File, "stemmer-factory.ads");
      Put_Line (File, "package Stemmer.Factory with SPARK_Mode is");
      New_Line (File);
      Put (File, " type Language_Type is (");
      for Lang of Languages loop
         --  Separate literals with a comma and a line break (not before the first).
         if I > 0 then
            Put_Line (File, ",");
            Put (File, " ");
         end if;
         Put (File, "L_" & To_Upper (Lang));
         I := I + 1;
      end loop;
      Put_Line (File, ");");
      New_Line (File);
      Put_Line (File, " function Stem (Language : in Language_Type;");
      Put_Line (File, " Word : in String) return String;");
      New_Line (File);
      Put_Line (File, "end Stemmer.Factory;");
      Close (File);
   end Write_Spec;

   --  Write the package body: a with-clause for Stemmer.S_<Capitalized name>
   --  per language, and a Stem function whose case statement declares the
   --  matching Context_Type, calls Stem_Word and returns Get_Result.
   procedure Write_Body is
      File : File_Type;
   begin
      Create (File, Out_File, "stemmer-factory.adb");
      for Lang of Languages loop
         Put_Line (File, "with Stemmer.S_" & Capitalize (Lang) & ";");
      end loop;
      Put_Line (File, "package body Stemmer.Factory with SPARK_Mode is");
      New_Line (File);
      Put_Line (File, " function Stem (Language : in Language_Type;");
      Put_Line (File, " Word : in String) return String is");
      Put_Line (File, " Result : Boolean := False;");
      Put_Line (File, " begin");
      Put_Line (File, " case Language is");
      for Lang of Languages loop
         --  One case alternative per language.
         Put_Line (File, " when L_" & To_Upper (Lang) & " =>");
         Put_Line (File, " declare");
         Put_Line (File, " C : Stemmer.S_" & Capitalize (Lang) & ".Context_Type;");
         Put_Line (File, " begin");
         Put_Line (File, " C.Stem_Word (Word, Result);");
         Put_Line (File, " return Get_Result (C);");
         Put_Line (File, " end;");
         New_Line (File);
      end loop;
      Put_Line (File, " end case;");
      Put_Line (File, " end Stem;");
      New_Line (File);
      Put_Line (File, "end Stemmer.Factory;");
      Close (File);
   end Write_Body;

   Count : constant Natural := Ada.Command_Line.Argument_Count;

begin
   --  Every command-line argument is a language name; store it lower-cased.
   for I in 1 .. Count loop
      Languages.Append (To_Lower (Ada.Command_Line.Argument (I)));
   end loop;
   Write_Spec;
   Write_Body;
end Generate;
snowball-3.1.0/ada/src/000077500000000000000000000000001520373054300146525ustar00rootroot00000000000000snowball-3.1.0/ada/src/stemmer.adb000066400000000000000000000467571520373054300170210ustar00rootroot00000000000000
-----------------------------------------------------------------------
-- stemmer -- Multi-language stemmer with Snowball generator
-- Written by Stephane Carrez (Stephane.Carrez@gmail.com)
-- All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions
-- are met:
--
-- 1. Redistributions of source code must retain the above copyright notice,
-- this list of conditions and the following disclaimer.
-- 2. Redistributions in binary form must reproduce the above copyright notice,
-- this list of conditions and the following disclaimer in the documentation
-- and/or other materials provided with the distribution.
-- 3. Neither the name of the Snowball project nor the names of its contributors
-- may be used to endorse or promote products derived from this software
-- without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-- DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------
with Interfaces;

--  Runtime support used by the generated stemmers.
--
--  Cursor values (C, L, Lb, Bra, Ket) are 0-based offsets while the buffer
--  Z.P is indexed from 1, so the byte that follows cursor position X is
--  Z.P (X + 1) and the byte that precedes it is Z.P (X).
package body Stemmer with SPARK_Mode is

   subtype Byte is Interfaces.Unsigned_8;
   use type Interfaces.Unsigned_8;

   --  Copy Word into the context buffer, reset the cursor and the limits so
   --  that they cover the whole word, then dispatch to the Stem operation of
   --  the actual context type.
   procedure Stem_Word (Z      : in out Context_Type'Class;
                        Word   : in String;
                        Result : out Boolean) is
   begin
      Z.P (1 .. Word'Length) := Word;
      Z.Len := Word'Length;
      Z.C := 0;
      Z.L := Word'Length;
      Z.Lb := 0;
      Stemmer.Stem (Z, Result);
   end Stem_Word;

   --  Return the first Z.Len bytes of the context buffer.
   function Get_Result (Z : in Context_Type'Class) return String is
   begin
      return Z.P (1 .. Z.Len);
   end Get_Result;

   --  Compare the Len bytes that follow the cursor with the first Len
   --  characters of S.  On a match the cursor is advanced by Len and Result
   --  is True; otherwise the cursor is unchanged and Result is False.
   procedure Eq_S (Z      : in out Context_Type'Class;
                   S      : in String;
                   Len    : in Char_Index;
                   Result : out Boolean) is
   begin
      if Z.L - Z.C < Len then
         Result := False;
         return;
      end if;
      --  Slice relative to S'First so that the comparison does not depend on
      --  the lower bound of the actual parameter.
      if Z.P (Z.C + 1 .. Z.C + Len) /= S (S'First .. S'First + Len - 1) then
         Result := False;
         return;
      end if;
      Z.C := Z.C + Len;
      Result := True;
   end Eq_S;

   --  Compare the Len bytes that precede the cursor with the first Len
   --  characters of S.  On a match the cursor is moved back by Len and Result
   --  is True; otherwise the cursor is unchanged and Result is False.
   procedure Eq_S_Backward (Z      : in out Context_Type'Class;
                            S      : in String;
                            Len    : in Char_Index;
                            Result : out Boolean) is
   begin
      if Z.C - Z.Lb < Len then
         Result := False;
         return;
      end if;
      --  Slice relative to S'First (see Eq_S).
      if Z.P (Z.C + 1 - Len .. Z.C) /= S (S'First .. S'First + Len - 1) then
         Result := False;
         return;
      end if;
      Z.C := Z.C - Len;
      Result := True;
   end Eq_S_Backward;

   --  Count the bytes among the first Len characters of S that are not UTF-8
   --  continuation bytes (i.e. bytes < 16#80# or >= 16#C0#).
   function Length_Utf8 (S   : in String;
                         Len : in Char_Index) return Natural is
      Count : Natural := 0;
      Val   : Byte;
   begin
      --  Iterate relative to S'First so that a string whose lower bound is
      --  not 1 is handled as well.
      for Pos in S'First .. S'First + Len - 1 loop
         Val := Character'Pos (S (Pos));
         if Val >= 16#C0# or Val < 16#80# then
            Count := Count + 1;
         end if;
      end loop;
      return Count;
   end Length_Utf8;

   --  Test the byte at 0-based offset Pos against a 32-bit mask.
   --  Returns True when the top three bits of the byte differ from Shift, or
   --  when the bit of Mask selected by the low five bits of the byte is clear.
   function Check_Among (Z     : in Context_Type'Class;
                         Pos   : in Char_Index;
                         Shift : in Natural;
                         Mask  : in Mask_Type) return Boolean is
      use Interfaces;
      Val : constant Byte := Character'Pos (Z.P (Pos + 1));
   begin
      if Natural (Shift_Right (Val, 5)) /= Shift then
         return True;
      end if;
      return (Shift_Right (Unsigned_64 (Mask), Natural (Val and 16#1f#)) and 1) = 0;
   end Check_Among;

   --  Look up the text that follows the cursor in the table Amongs.
   --  Each entry W designates the key Pattern (W.First .. W.Last).  A binary
   --  search narrows the table down to one entry, then the Substring_I chain
   --  is followed from that entry until an entry fully matches.  On success
   --  the cursor is placed after the matched key and Result is W.Result; an
   --  entry with a non-zero Operation only succeeds when Execute reports
   --  Status = True.  Result is 0 when nothing matches.
   procedure Find_Among (Z       : in out Context_Type'Class;
                         Amongs  : in Among_Array_Type;
                         Pattern : in String;
                         Execute : access procedure
                           (Ctx       : in out Context_Type'Class;
                            Operation : in Operation_Index;
                            Status    : out Boolean);
                         Result  : out Integer) is
      I        : Natural := Amongs'First;
      J        : Natural := Amongs'Last + 1;
      Common_I : Natural := 0;  --  Bytes known to match the key at I.
      Common_J : Natural := 0;  --  Bytes known to match the key at J.
      First_Key_Inspected : Boolean := False;
      C : constant Natural := Z.C;
      L : constant Integer := Z.L;
   begin
      loop
         declare
            K      : constant Natural := I + (J - I) / 2;
            W      : constant Among_Type := Amongs (K);
            --  Start comparing after the prefix already known to match.
            Common : Natural := (if Common_I < Common_J then Common_I else Common_J);
            Diff   : Integer := 0;
         begin
            for I2 in W.First + Common .. W.Last loop
               if C + Common = L then
                  --  Input exhausted before the key: treat input as smaller.
                  Diff := -1;
                  exit;
               end if;
               Diff := Character'Pos (Z.P (C + Common + 1)) - Character'Pos (Pattern (I2));
               exit when Diff /= 0;
               Common := Common + 1;
            end loop;
            if Diff < 0 then
               J := K;
               Common_J := Common;
            else
               I := K;
               Common_I := Common;
            end if;
         end;
         if J - I <= 1 then
            exit when I > 0 or J = I or First_Key_Inspected;
            --  I is still the first entry and has not necessarily been
            --  compared yet: run one more iteration to inspect it.
            First_Key_Inspected := True;
         end if;
      end loop;

      loop
         declare
            W      : constant Among_Type := Amongs (I);
            Len    : constant Natural := W.Last - W.First + 1;
            Status : Boolean;
         begin
            if Common_I >= Len then
               Z.C := C + Len;
               if W.Operation = 0 then
                  Result := W.Result;
                  return;
               end if;
               Execute (Z, W.Operation, Status);
               if Status then
                  --  Execute may have moved the cursor: restore it.
                  Z.C := C + Len;
                  Result := W.Result;
                  return;
               end if;
            end if;
            exit when W.Substring_I < 0;
            I := W.Substring_I;
         end;
      end loop;
      Result := 0;
   end Find_Among;

   --  Same as Find_Among but the keys are compared from their last character
   --  against the text that precedes the cursor, down to the limit Z.Lb.
   --  On success the cursor is placed before the matched key.
   procedure Find_Among_Backward (Z       : in out Context_Type'Class;
                                  Amongs  : in Among_Array_Type;
                                  Pattern : in String;
                                  Execute : access procedure
                                    (Ctx       : in out Context_Type'Class;
                                     Operation : in Operation_Index;
                                     Status    : out Boolean);
                                  Result  : out Integer) is
      I        : Natural := Amongs'First;
      J        : Natural := Amongs'Last + 1;
      Common_I : Natural := 0;
      Common_J : Natural := 0;
      First_Key_Inspected : Boolean := False;
      C  : constant Integer := Z.C;
      Lb : constant Integer := Z.Lb;
   begin
      loop
         declare
            K      : constant Natural := I + (J - I) / 2;
            W      : constant Among_Type := Amongs (K);
            Common : Natural := (if Common_I < Common_J then Common_I else Common_J);
            Diff   : Integer := 0;
         begin
            for I2 in reverse W.First .. W.Last - Common loop
               if C - Common = Lb then
                  --  Backward limit reached before the key was exhausted.
                  Diff := -1;
                  exit;
               end if;
               Diff := Character'Pos (Z.P (C - Common)) - Character'Pos (Pattern (I2));
               exit when Diff /= 0;
               Common := Common + 1;
            end loop;
            if Diff < 0 then
               J := K;
               Common_J := Common;
            else
               I := K;
               Common_I := Common;
            end if;
         end;
         if J - I <= 1 then
            exit when I > 0 or J = I or First_Key_Inspected;
            First_Key_Inspected := True;
         end if;
      end loop;

      loop
         declare
            W      : constant Among_Type := Amongs (I);
            Len    : constant Natural := W.Last - W.First + 1;
            Status : Boolean;
         begin
            if Common_I >= Len then
               Z.C := C - Len;
               if W.Operation = 0 then
                  Result := W.Result;
                  return;
               end if;
               Execute (Z, W.Operation, Status);
               if Status then
                  --  Execute may have moved the cursor: restore it.
                  Z.C := C - Len;
                  Result := W.Result;
                  return;
               end if;
            end if;
            exit when W.Substring_I < 0;
            I := W.Substring_I;
         end;
      end loop;
      Result := 0;
   end Find_Among_Backward;

   --  Return the cursor position after skipping one UTF-8 character forward,
   --  or -1 when the cursor is already at the limit Z.L.
   function Skip_Utf8 (Z : in Context_Type'Class) return Result_Index is
      Pos : Char_Index := Z.C;
      Val : Byte;
   begin
      if Pos >= Z.L then
         return -1;
      end if;
      Pos := Pos + 1;
      Val := Character'Pos (Z.P (Pos));
      if Val >= 16#C0# then
         --  Lead byte of a multi-byte sequence: skip its continuation bytes.
         while Pos < Z.L loop
            Val := Character'Pos (Z.P (Pos + 1));
            exit when Val >= 16#C0# or Val < 16#80#;
            Pos := Pos + 1;
         end loop;
      end if;
      return Pos;
   end Skip_Utf8;

   --  Return the cursor position after skipping N UTF-8 characters forward,
   --  or -1 when N is negative or the limit Z.L is reached first.
   function Skip_Utf8 (Z : in Context_Type'Class;
                       N : in Integer) return Result_Index is
      Pos : Char_Index := Z.C;
      Val : Byte;
   begin
      if N < 0 then
         return -1;
      end if;
      for I in 1 .. N loop
         if Pos >= Z.L then
            return -1;
         end if;
         Pos := Pos + 1;
         Val := Character'Pos (Z.P (Pos));
         if Val >= 16#C0# then
            while Pos < Z.L loop
               Val := Character'Pos (Z.P (Pos + 1));
               exit when Val >= 16#C0# or Val < 16#80#;
               Pos := Pos + 1;
            end loop;
         end if;
      end loop;
      return Pos;
   end Skip_Utf8;

   --  Return the cursor position after skipping one UTF-8 character
   --  backward, or -1 when the cursor is already at the limit Z.Lb.
   function Skip_Utf8_Backward (Z : in Context_Type'Class) return Result_Index is
      Pos : Char_Index := Z.C;
      Val : Byte;
   begin
      if Pos <= Z.Lb then
         return -1;
      end if;
      Val := Character'Pos (Z.P (Pos));
      Pos := Pos - 1;
      if Val >= 16#80# then
         --  Inside a multi-byte sequence: move back until the byte at the
         --  new position is a lead byte.
         while Pos > Z.Lb loop
            Val := Character'Pos (Z.P (Pos + 1));
            exit when Val >= 16#C0#;
            Pos := Pos - 1;
         end loop;
      end if;
      return Pos;
   end Skip_Utf8_Backward;

   --  Return the cursor position after skipping N UTF-8 characters backward,
   --  or -1 when N is negative or the limit Z.Lb is reached first.
   function Skip_Utf8_Backward (Z : in Context_Type'Class;
                                N : in Integer) return Result_Index is
      Pos : Char_Index := Z.C;
      Val : Byte;
   begin
      if N < 0 then
         return -1;
      end if;
      for I in 1 .. N loop
         if Pos <= Z.Lb then
            return -1;
         end if;
         Val := Character'Pos (Z.P (Pos));
         Pos := Pos - 1;
         if Val >= 16#80# then
            while Pos > Z.Lb loop
               Val := Character'Pos (Z.P (Pos + 1));
               exit when Val >= 16#C0#;
               Pos := Pos - 1;
            end loop;
         end if;
      end loop;
      return Pos;
   end Skip_Utf8_Backward;

   --  Logical left shift on Utf8_Type, implemented with Interfaces.Shift_Left.
   function Shift_Left (Value : in Utf8_Type;
                        Shift : in Natural) return Utf8_Type is
      (Utf8_Type (Interfaces.Shift_Left (Interfaces.Unsigned_32 (Value), Shift)));

   --  Decode the UTF-8 character that follows the cursor.
   --  Value receives the code point and Count the number of bytes used
   --  (1 to 4); Count is 0 when the cursor is at the limit Z.L.  A sequence
   --  truncated by the limit is decoded from the bytes that are available.
   procedure Get_Utf8 (Z     : in Context_Type'Class;
                       Value : out Utf8_Type;
                       Count : out Natural) is
      B0, B1, B2, B3 : Byte;
   begin
      if Z.C >= Z.L then
         Value := 0;
         Count := 0;
         return;
      end if;
      B0 := Character'Pos (Z.P (Z.C + 1));
      if B0 < 16#C0# or Z.C + 1 >= Z.L then
         Value := Utf8_Type (B0);
         Count := 1;
         return;
      end if;
      B1 := Character'Pos (Z.P (Z.C + 2)) and 16#3F#;
      if B0 < 16#E0# or Z.C + 2 >= Z.L then
         Value := Shift_Left (Utf8_Type (B0 and 16#1F#), 6) or Utf8_Type (B1);
         Count := 2;
         return;
      end if;
      B2 := Character'Pos (Z.P (Z.C + 3)) and 16#3F#;
      if B0 < 16#F0# or Z.C + 3 >= Z.L then
         Value := Shift_Left (Utf8_Type (B0 and 16#0F#), 12)
           or Shift_Left (Utf8_Type (B1), 6)
           or Utf8_Type (B2);
         Count := 3;
         return;
      end if;
      B3 := Character'Pos (Z.P (Z.C + 4)) and 16#3F#;
      Value := Shift_Left (Utf8_Type (B0 and 16#07#), 18)
        or Shift_Left (Utf8_Type (B1), 12)
        or Shift_Left (Utf8_Type (B2), 6)
        or Utf8_Type (B3);
      Count := 4;
   end Get_Utf8;

   --  Decode the UTF-8 character that precedes the cursor.
   --  Value receives the code point and Count the number of bytes used
   --  (1 to 4); Count is 0 when the cursor is at the limit Z.Lb.  B3 is the
   --  byte closest to the cursor and B0 the farthest one.
   procedure Get_Utf8_Backward (Z     : in Context_Type'Class;
                                Value : out Utf8_Type;
                                Count : out Natural) is
      B0, B1, B2, B3 : Byte;
   begin
      if Z.C <= Z.Lb then
         Value := 0;
         Count := 0;
         return;
      end if;
      B3 := Character'Pos (Z.P (Z.C));
      if B3 < 16#80# or Z.C - 1 <= Z.Lb then
         Value := Utf8_Type (B3);
         Count := 1;
         return;
      end if;
      B2 := Character'Pos (Z.P (Z.C - 1));
      if B2 >= 16#C0# or Z.C - 2 <= Z.Lb then
         B3 := B3 and 16#3F#;
         Value := Shift_Left (Utf8_Type (B2 and 16#1F#), 6) or Utf8_Type (B3);
         Count := 2;
         return;
      end if;
      B1 := Character'Pos (Z.P (Z.C - 2));
      if B1 >= 16#E0# or Z.C - 3 <= Z.Lb then
         B3 := B3 and 16#3F#;
         B2 := B2 and 16#3F#;
         Value := Shift_Left (Utf8_Type (B1 and 16#0F#), 12)
           or Shift_Left (Utf8_Type (B2), 6)
           or Utf8_Type (B3);
         Count := 3;
         return;
      end if;
      B0 := Character'Pos (Z.P (Z.C - 3));
      --  In a 4-byte sequence B1 is a continuation byte carrying 6 payload
      --  bits, so it is masked with 16#3F# exactly as Get_Utf8 does
      --  (a 16#1F# mask would drop bit 5 and decode a wrong code point).
      B1 := B1 and 16#3F#;
      B2 := B2 and 16#3F#;
      B3 := B3 and 16#3F#;
      Value := Shift_Left (Utf8_Type (B0 and 16#07#), 18)
        or Shift_Left (Utf8_Type (B1), 12)
        or Shift_Left (Utf8_Type (B2), 6)
        or Utf8_Type (B3);
      Count := 4;
   end Get_Utf8_Backward;

   --  Advance the cursor over characters that are NOT in the grouping S
   --  (S is indexed by code point minus Min, for code points in Min .. Max).
   --  Result is -1 when the limit is reached, the byte length of the
   --  character when a character of the grouping is found (the cursor is left
   --  in front of it), and 0 when one character was skipped and Repeat is
   --  False.
   procedure Out_Grouping (Z      : in out Context_Type'Class;
                           S      : in Grouping_Array;
                           Min    : in Utf8_Type;
                           Max    : in Utf8_Type;
                           Repeat : in Boolean;
                           Result : out Result_Index) is
      Ch    : Utf8_Type;
      Count : Natural;
   begin
      if Z.C >= Z.L then
         Result := -1;
         return;
      end if;
      loop
         Get_Utf8 (Z, Ch, Count);
         if Count = 0 then
            Result := -1;
            return;
         end if;
         if Ch <= Max and Ch >= Min then
            Ch := Ch - Min;
            if S (Ch) then
               Result := Count;
               return;
            end if;
         end if;
         Z.C := Z.C + Count;
         exit when not Repeat;
      end loop;
      Result := 0;
   end Out_Grouping;

   --  Backward counterpart of Out_Grouping: characters are read before the
   --  cursor and the cursor moves towards the limit Z.Lb.
   procedure Out_Grouping_Backward (Z      : in out Context_Type'Class;
                                    S      : in Grouping_Array;
                                    Min    : in Utf8_Type;
                                    Max    : in Utf8_Type;
                                    Repeat : in Boolean;
                                    Result : out Result_Index) is
      Ch    : Utf8_Type;
      Count : Natural;
   begin
      if Z.C <= Z.Lb then
         Result := -1;
         return;
      end if;
      loop
         Get_Utf8_Backward (Z, Ch, Count);
         if Count = 0 then
            Result := -1;
            return;
         end if;
         if Ch <= Max and Ch >= Min then
            Ch := Ch - Min;
            if S (Ch) then
               Result := Count;
               return;
            end if;
         end if;
         Z.C := Z.C - Count;
         exit when not Repeat;
      end loop;
      Result := 0;
   end Out_Grouping_Backward;

   --  Advance the cursor over characters that ARE in the grouping S.
   --  Result is -1 when the limit is reached, the byte length of the
   --  character when a character outside the grouping is found (the cursor is
   --  left in front of it), and 0 when one character was skipped and Repeat
   --  is False.
   procedure In_Grouping (Z      : in out Context_Type'Class;
                          S      : in Grouping_Array;
                          Min    : in Utf8_Type;
                          Max    : in Utf8_Type;
                          Repeat : in Boolean;
                          Result : out Result_Index) is
      Ch    : Utf8_Type;
      Count : Natural;
   begin
      if Z.C >= Z.L then
         Result := -1;
         return;
      end if;
      loop
         Get_Utf8 (Z, Ch, Count);
         if Count = 0 then
            Result := -1;
            return;
         end if;
         if Ch > Max or Ch < Min then
            Result := Count;
            return;
         end if;
         Ch := Ch - Min;
         if not S (Ch) then
            Result := Count;
            return;
         end if;
         Z.C := Z.C + Count;
         exit when not Repeat;
      end loop;
      Result := 0;
   end In_Grouping;

   --  Backward counterpart of In_Grouping: characters are read before the
   --  cursor and the cursor moves towards the limit Z.Lb.
   procedure In_Grouping_Backward (Z      : in out Context_Type'Class;
                                   S      : in Grouping_Array;
                                   Min    : in Utf8_Type;
                                   Max    : in Utf8_Type;
                                   Repeat : in Boolean;
                                   Result : out Result_Index) is
      Ch    : Utf8_Type;
      Count : Natural;
   begin
      if Z.C <= Z.Lb then
         Result := -1;
         return;
      end if;
      loop
         Get_Utf8_Backward (Z, Ch, Count);
         if Count = 0 then
            Result := -1;
            return;
         end if;
         if Ch > Max or Ch < Min then
            Result := Count;
            return;
         end if;
         Ch := Ch - Min;
         if not S (Ch) then
            Result := Count;
            return;
         end if;
         Z.C := Z.C - Count;
         exit when not Repeat;
      end loop;
      Result := 0;
   end In_Grouping_Backward;

   --  Replace the bytes between offsets C_Bra and C_Ket with the first Len
   --  characters of S.  The tail of the buffer is moved when the length
   --  changes, Z.Len and Z.L are adjusted, and the cursor is shifted when it
   --  was at or after C_Ket, or moved to C_Bra when it was inside the
   --  replaced region.
   procedure Replace (Z     : in out Context_Type'Class;
                      C_Bra : in Char_Index;
                      C_Ket : in Char_Index;
                      S     : in String;
                      Len   : in Char_Index) is
      Adjustment : Integer;
   begin
      Adjustment := Len - (C_Ket - C_Bra);
      if Adjustment /= 0 then
         Z.P (C_Ket + Adjustment + 1 .. Z.Len + Adjustment) := Z.P (C_Ket + 1 .. Z.Len);
         Z.Len := Z.Len + Adjustment;
         Z.L := Z.L + Adjustment;
         if Z.C >= C_Ket then
            Z.C := Z.C + Adjustment;
         elsif Z.C > C_Bra then
            Z.C := C_Bra;
         end if;
      end if;
      if Len > 0 then
         --  Slice relative to S'First so that the copy does not depend on the
         --  lower bound of the actual parameter.
         Z.P (C_Bra + 1 .. C_Bra + Len) := S (S'First .. S'First + Len - 1);
      end if;
   end Replace;

   --  Delete the slice Z.Bra .. Z.Ket and collapse Ket onto Bra.
   procedure Slice_Del (Z : in out Context_Type'Class) is
   begin
      Replace (Z, Z.Bra, Z.Ket, "", 0);
      Z.Ket := Z.Bra;
   end Slice_Del;

   --  Replace the slice Z.Bra .. Z.Ket with the first Len characters of Text
   --  and set Ket to the end of the inserted text.
   procedure Slice_From (Z    : in out Context_Type'Class;
                         Text : in String;
                         Len  : in Char_Index) is
   begin
      Replace (Z, Z.Bra, Z.Ket, Text, Len);
      Z.Ket := Z.Bra + Len;
   end Slice_From;

   --  Insert the first Len characters of S at the cursor.  Bra and Ket are
   --  shifted by Len when they were at or after the insertion point.
   procedure Insert (Z   : in out Context_Type'Class;
                     S   : in String;
                     Len : in Char_Index) is
      C : Char_Index;
   begin
      --  Remember the insertion point: Replace may move Z.C.
      C := Z.C;
      Replace (Z, Z.C, Z.C, S, Len);
      if C <= Z.Bra then
         Z.Bra := Z.Bra + Len;
      end if;
      if C <= Z.Ket then
         Z.Ket := Z.Ket + Len;
      end if;
   end Insert;

end Stemmer;
snowball-3.1.0/ada/src/stemmer.ads000066400000000000000000000222101520373054300170140ustar00rootroot00000000000000
-----------------------------------------------------------------------
-- stemmer -- Multi-language stemmer with Snowball generator
-- Written by Stephane Carrez (Stephane.Carrez@gmail.com)
-- All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions
-- are met:
--
-- 1. Redistributions of source code must retain the above copyright notice,
-- this list of conditions and the following disclaimer.
-- 2. Redistributions in binary form must reproduce the above copyright notice,
-- this list of conditions and the following disclaimer in the documentation
-- and/or other materials provided with the distribution.
-- 3. Neither the name of the Snowball project nor the names of its contributors
-- may be used to endorse or promote products derived from this software
-- without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-- DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------- package Stemmer with SPARK_Mode is pragma Preelaborate; WORD_MAX_LENGTH : constant := 1024; type Context_Type is abstract tagged private; -- Apply the stemming algorithm on the word initialized in the context. procedure Stem (Z : in out Context_Type; Result : out Boolean) is abstract; -- Stem the word and return True if it was reduced. procedure Stem_Word (Z : in out Context_Type'Class; Word : in String; Result : out Boolean) with Global => null, Pre => Word'Length < WORD_MAX_LENGTH; -- Get the stem or the input word unmodified. function Get_Result (Z : in Context_Type'Class) return String with Global => null, Post => Get_Result'Result'Length < WORD_MAX_LENGTH; private type Mask_Type is mod 2**32; -- A 32-bit character value that was read from UTF-8 sequence. -- A modular value is used because shift and logical arithmetic is necessary. type Utf8_Type is mod 2**32; -- Index of the Grouping_Array. The index comes from the 32-bit character value -- minus a starting offset. We don't expect large tables and we check against -- a maximum value. subtype Grouping_Index is Utf8_Type range 0 .. 16384; type Grouping_Array is array (Grouping_Index range <>) of Boolean with Pack; subtype Among_Index is Natural range 0 .. 65535; subtype Among_Start_Index is Among_Index range 1 .. Among_Index'Last; subtype Operation_Index is Natural range 0 .. 
65535; subtype Result_Index is Integer range -1 .. WORD_MAX_LENGTH - 1; subtype Char_Index is Result_Index range 0 .. Result_Index'Last; type Among_Type is record First : Among_Start_Index; Last : Among_Index; Substring_I : Integer; Result : Integer; Operation : Operation_Index; end record; type Among_Array_Type is array (Natural range <>) of Among_Type; procedure Eq_S (Z : in out Context_Type'Class; S : in String; Len : in Char_Index; Result : out Boolean) with Global => null; procedure Eq_S_Backward (Z : in out Context_Type'Class; S : in String; Len : in Char_Index; Result : out Boolean) with Global => null; procedure Find_Among (Z : in out Context_Type'Class; Amongs : in Among_Array_Type; Pattern : in String; Execute : access procedure (Ctx : in out Context_Type'Class; Operation : in Operation_Index; Status : out Boolean); Result : out Integer) with Global => null, Pre => Pattern'Length > 0 and Amongs'Length > 0; procedure Find_Among_Backward (Z : in out Context_Type'Class; Amongs : in Among_Array_Type; Pattern : in String; Execute : access procedure (Ctx : in out Context_Type'Class; Operation : in Operation_Index; Status : out Boolean); Result : out Integer) with Global => null, Pre => Pattern'Length > 0 and Amongs'Length > 0; function Skip_Utf8 (Z : in Context_Type'Class) return Result_Index with Global => null; function Skip_Utf8 (Z : in Context_Type'Class; N : in Integer) return Result_Index with Global => null; function Skip_Utf8_Backward (Z : in Context_Type'Class) return Result_Index with Global => null; function Skip_Utf8_Backward (Z : in Context_Type'Class; N : in Integer) return Result_Index with Global => null; procedure Get_Utf8 (Z : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural); procedure Get_Utf8_Backward (Z : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural); function Length_Utf8 (S : in String; Len : in Char_Index) return Natural; function Check_Among (Z : in Context_Type'Class; Pos : in Char_Index; Shift : 
in Natural; Mask : in Mask_Type) return Boolean; procedure Out_Grouping (Z : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure Out_Grouping_Backward (Z : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure In_Grouping (Z : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure In_Grouping_Backward (Z : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure Replace (Z : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in Char_Index; S : in String; Len : in Char_Index) with Global => null, Pre => C_Bra <= C_Ket and C_Ket <= Z.Len and Z.Len + Len - (C_Ket - C_Bra) < Z.P'Length; procedure Slice_Del (Z : in out Context_Type'Class) with Global => null, Pre => Z.Bra <= Z.Ket and Z.Ket <= Z.Len; procedure Slice_From (Z : in out Context_Type'Class; Text : in String; Len : in Char_Index) with Global => null, Pre => Z.Bra <= Z.Ket and Z.Ket <= Z.Len; procedure Insert (Z : in out Context_Type'Class; S : in String; Len : in Char_Index) with Global => null, Pre => Z.C <= Z.Len and Z.Len + Len < Z.P'Length; -- The context indexes follow the C paradigm: they start at 0 for the first character. -- This is necessary because several algorithms rely on this when they compare the -- cursor position ('C') or setup some markers from the cursor. type Context_Type is abstract tagged record C : Char_Index := 0; Len : Char_Index := 0; L : Char_Index := 0; Lb : Char_Index := 0; Bra : Char_Index := 0; Ket : Char_Index := 0; P : String (1 .. 
WORD_MAX_LENGTH); end record; end Stemmer; snowball-3.1.0/ada/src/stemwords.adb000066400000000000000000000054531520373054300173600ustar00rootroot00000000000000with Ada.Text_IO; with Ada.Command_Line; with Stemmer.Factory; procedure Stemwords is use Stemmer.Factory; function Get_Language (Name : in String; Error: out Boolean) return Language_Type; function Is_Space (C : in Character) return Boolean; procedure Show_Usage; function Is_Space (C : in Character) return Boolean is begin return C = ' ' or C = ASCII.HT; end Is_Space; function Get_Language (Name : in String; Error: out Boolean) return Language_Type is begin Error := False; return Language_Type'Value ("L_" & Name); exception when Constraint_Error => Ada.Text_IO.Put_Line ("Unsupported language: " & Name); Error := True; return Language_Type'Val (0); end Get_Language; procedure Show_Usage is begin Ada.Text_IO.Put_Line ("Usage: stemwords "); Ada.Command_Line.Set_Exit_Status(Ada.Command_Line.Failure); end Show_Usage; Count : constant Natural := Ada.Command_Line.Argument_Count; begin if Count /= 3 then Show_Usage; return; end if; declare Bad_Usage : Boolean := False; Lang : constant Language_Type := Get_Language (Ada.Command_Line.Argument (1), Bad_Usage); Input : constant String := Ada.Command_Line.Argument (2); Output : constant String := Ada.Command_Line.Argument (3); Src_File : Ada.Text_IO.File_Type; Dst_File : Ada.Text_IO.File_Type; begin if Bad_Usage then Show_Usage; return; end if; Ada.Text_IO.Open (Src_File, Ada.Text_IO.In_File, Input); Ada.Text_IO.Create (Dst_File, Ada.Text_IO.Out_File, Output); while not Ada.Text_IO.End_Of_File (Src_File) loop declare Line : constant String := Ada.Text_IO.Get_Line (Src_File); Pos : Positive := Line'First; Last_Pos : Positive; Start_Pos : Positive; begin while Pos <= Line'Last loop Last_Pos := Pos; while Pos <= Line'Last and then Is_Space (Line (Pos)) loop Pos := Pos + 1; end loop; if Last_Pos < Pos then Ada.Text_IO.Put (Dst_File, Line (Last_Pos .. 
Pos - 1)); end if; exit when Pos > Line'Last; Start_Pos := Pos; while Pos <= Line'Last and then not Is_Space (Line (Pos)) loop Pos := Pos + 1; end loop; Ada.Text_IO.Put (Dst_File, Stemmer.Factory.Stem (Lang, Line (Start_Pos .. Pos - 1))); end loop; Ada.Text_IO.New_Line (Dst_File); end; end loop; Ada.Text_IO.Close (Src_File); Ada.Text_IO.Close (Dst_File); end; end Stemwords; snowball-3.1.0/ada/stemmer_config.gpr000066400000000000000000000044371520373054300176060ustar00rootroot00000000000000abstract project Stemmer_Config is for Source_Dirs use (); type Yes_No is ("yes", "no"); type Library_Type_Type is ("relocatable", "static", "static-pic"); type Build_Type is ("distrib", "debug", "optimize", "profile", "coverage"); Mode : Build_Type := external ("BUILD", "distrib"); Processors := External ("PROCESSORS", "1"); package Builder is case Mode is when "debug" => for Default_Switches ("Ada") use ("-g", "-j" & Processors); when others => for Default_Switches ("Ada") use ("-g", "-O3", "-j" & Processors); end case; end Builder; package compiler is warnings := ("-gnatwua"); defaults := ("-gnat2012"); case Mode is when "distrib" => for Default_Switches ("Ada") use defaults & ("-gnatafno", "-gnatVa", "-gnatwa"); when "debug" => for Default_Switches ("Ada") use defaults & warnings & ("-gnata", "-gnatVaMI", "-gnaty3abcefhiklmnprstxM99"); when "coverage" => for Default_Switches ("Ada") use defaults & warnings & ("-gnata", "-gnatVaMI", "-gnaty3abcefhiklmnprstxM99", "-fprofile-arcs", "-ftest-coverage"); when "optimize" => for Default_Switches ("Ada") use defaults & warnings & ("-gnatn", "-gnatp", "-fdata-sections", "-ffunction-sections"); when "profile" => for Default_Switches ("Ada") use defaults & warnings & ("-pg"); end case; end compiler; package binder is case Mode is when "debug" => for Default_Switches ("Ada") use ("-E"); when others => for Default_Switches ("Ada") use ("-E"); end case; end binder; package linker is case Mode is when "profile" => for Default_Switches ("Ada") 
use ("-pg"); when "distrib" => for Default_Switches ("Ada") use ("-s"); when "optimize" => for Default_Switches ("Ada") use ("-Wl,--gc-sections"); when "coverage" => for Default_Switches ("ada") use ("-fprofile-arcs"); when others => null; end case; end linker; package Ide is for VCS_Kind use "git"; end Ide; end Stemmer_Config; snowball-3.1.0/ada/stemwords.gpr000066400000000000000000000007601520373054300166270ustar00rootroot00000000000000with "stemmer_config"; project Stemwords is Mains := ("stemwords.adb"); for Main use Mains; for Source_Dirs use ("src", "algorithms"); for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj"; for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin"; package Binder renames Stemmer_Config.Binder; package Builder renames Stemmer_Config.Builder; package Compiler renames Stemmer_Config.Compiler; package Linker renames Stemmer_Config.Linker; end Stemwords; snowball-3.1.0/algorithms/000077500000000000000000000000001520373054300155075ustar00rootroot00000000000000snowball-3.1.0/algorithms/arabic.sbl000066400000000000000000000422551520373054300174420ustar00rootroot00000000000000/* * Authors: * - Assem Chelli, < assem [dot] ch [at] gmail > * - Abdelkrim Aries * */ stringescapes { } /* the Arabic letters in Unicode */ // Hamza stringdef o '{U+0621}' // Hamza stringdef ao '{U+0623}' // Hamza above Alef stringdef ao_ '{U+0625}' // Hamza below Alef stringdef a~ '{U+0622}' // Alef madda stringdef wo '{U+0624}' // Hamza above waw stringdef yo '{U+0626}' // Hamza above yeh // Letters stringdef a '{U+0627}' // Alef stringdef a_ '{U+0649}' // Alef Maksura stringdef b '{U+0628}' // Beh stringdef t_ '{U+0629}' // Teh_Marbuta stringdef t '{U+062A}' // Teh stringdef th '{U+062B}' // Theh stringdef j '{U+062C}' // Jeem stringdef h '{U+062D}' // Hah stringdef x '{U+062E}' // Khah stringdef d '{U+062F}' // Dal stringdef dz '{U+0630}' // Thal stringdef r '{U+0631}' // Reh stringdef z '{U+0632}' // Zain stringdef s '{U+0633}' // Seen stringdef sh 
'{U+0634}' // Sheen stringdef c '{U+0635}' // Sad stringdef dh '{U+0636}' // Dad stringdef tt '{U+0637}' // Tah stringdef zh '{U+0638}' // Zah stringdef i '{U+0639}' // Ain stringdef gh '{U+063A}' // Ghain stringdef f '{U+0641}' // Feh stringdef q '{U+0642}' // Qaf stringdef k '{U+0643}' // Kaf stringdef l '{U+0644}' // Lam stringdef m '{U+0645}' // Meem stringdef n '{U+0646}' // Noon stringdef e '{U+0647}' // Heh stringdef w '{U+0648}' // Waw stringdef y '{U+064A}' // Yeh // Diacritics stringdef aan '{U+064B}' // FatHatan stringdef uun '{U+064C}' // Dammatan stringdef iin '{U+064D}' // Kasratan stringdef aa '{U+064E}' // FatHa stringdef uu '{U+064F}' // Damma stringdef ii '{U+0650}' // Kasra stringdef oo '{U+0652}' // Sukun stringdef ~ '{U+0651}' // Shadda // Hindu–Arabic numerals stringdef 0 '{U+0660}' stringdef 1 '{U+0661}' stringdef 2 '{U+0662}' stringdef 3 '{U+0663}' stringdef 4 '{U+0664}' stringdef 5 '{U+0665}' stringdef 6 '{U+0666}' stringdef 7 '{U+0667}' stringdef 8 '{U+0668}' stringdef 9 '{U+0669}' // Kasheeda stringdef _ '{U+0640}' // Kasheeda, Tatweel // Shaped forms stringdef o1 '{U+FE80}' // HAMZA stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW stringdef yo1 '{U+FE8B}' // YEH_HAMZA stringdef yo2 '{U+FE8C}' // YEH_HAMZA stringdef yo3 '{U+FE89}' // YEH_HAMZA stringdef yo4 '{U+FE8A}' // YEH_HAMZA stringdef a~1 '{U+FE81}' // ALEF_MADDA stringdef a~2 '{U+FE82}' // ALEF_MADDA stringdef wo1 '{U+FE85}' // WAW_HAMZA stringdef wo2 '{U+FE86}' // WAW_HAMZA stringdef a1 '{U+FE8D}' // ALEF stringdef a2 '{U+FE8E}' // ALEF stringdef b1 '{U+FE8F}' // BEH stringdef b2 '{U+FE90}' // BEH stringdef b3 '{U+FE91}' // BEH stringdef b4 '{U+FE92}' // BEH stringdef t_1 '{U+FE93}' // TEH_MARBUTA stringdef t_2 '{U+FE94}' // TEH_MARBUTA stringdef t1 '{U+FE97}' // TEH stringdef t2 '{U+FE98}' // TEH stringdef t3 '{U+FE95}' // TEH stringdef t4 
'{U+FE96}' // TEH stringdef th1 '{U+FE9B}' // THEH stringdef th2 '{U+FE9C}' // THEH stringdef th3 '{U+FE9A}' // THEH stringdef th4 '{U+FE99}' // THEH stringdef j1 '{U+FE9F}' // JEEM stringdef j2 '{U+FEA0}' // JEEM stringdef j3 '{U+FE9D}' // JEEM stringdef j4 '{U+FE9E}' // JEEM stringdef h1 '{U+FEA3}' // HAH stringdef h2 '{U+FEA4}' // HAH stringdef h3 '{U+FEA1}' // HAH stringdef h4 '{U+FEA2}' // HAH stringdef x1 '{U+FEA7}' // KHAH stringdef x2 '{U+FEA8}' // KHAH stringdef x3 '{U+FEA5}' // KHAH stringdef x4 '{U+FEA6}' // KHAH stringdef d1 '{U+FEA9}' // DAL stringdef d2 '{U+FEAA}' // DAL stringdef dz1 '{U+FEAB}' // THAL stringdef dz2 '{U+FEAC}' // THAL stringdef r1 '{U+FEAD}' // REH stringdef r2 '{U+FEAE}' // REH stringdef z1 '{U+FEAF}' // ZAIN stringdef z2 '{U+FEB0}' // ZAIN stringdef s1 '{U+FEB3}' // SEEN stringdef s2 '{U+FEB4}' // SEEN stringdef s3 '{U+FEB1}' // SEEN stringdef s4 '{U+FEB2}' // SEEN stringdef sh1 '{U+FEB7}' // SHEEN stringdef sh2 '{U+FEB8}' // SHEEN stringdef sh3 '{U+FEB5}' // SHEEN stringdef sh4 '{U+FEB6}' // SHEEN stringdef c1 '{U+FEBB}' // SAD stringdef c2 '{U+FEBC}' // SAD stringdef c3 '{U+FEB9}' // SAD stringdef c4 '{U+FEBA}' // SAD stringdef dh1 '{U+FEBF}' // DAD stringdef dh2 '{U+FEC0}' // DAD stringdef dh3 '{U+FEBD}' // DAD stringdef dh4 '{U+FEBE}' // DAD stringdef tt1 '{U+FEC3}' // TAH stringdef tt2 '{U+FEC4}' // TAH stringdef tt3 '{U+FEC1}' // TAH stringdef tt4 '{U+FEC2}' // TAH stringdef zh1 '{U+FEC7}' // ZAH stringdef zh2 '{U+FEC8}' // ZAH stringdef zh3 '{U+FEC5}' // ZAH stringdef zh4 '{U+FEC6}' // ZAH stringdef i1 '{U+FECB}' // AIN stringdef i2 '{U+FECC}' // AIN stringdef i3 '{U+FEC9}' // AIN stringdef i4 '{U+FECA}' // AIN stringdef gh1 '{U+FECF}' // GHAIN stringdef gh2 '{U+FED0}' // GHAIN stringdef gh3 '{U+FECD}' // GHAIN stringdef gh4 '{U+FECE}' // GHAIN stringdef f1 '{U+FED3}' // FEH stringdef f2 '{U+FED4}' // FEH stringdef f3 '{U+FED1}' // FEH stringdef f4 '{U+FED2}' // FEH stringdef q1 '{U+FED7}' // QAF stringdef q2 '{U+FED8}' // 
QAF stringdef q3 '{U+FED5}' // QAF stringdef q4 '{U+FED6}' // QAF stringdef k1 '{U+FEDB}' // KAF stringdef k2 '{U+FEDC}' // KAF stringdef k3 '{U+FED9}' // KAF stringdef k4 '{U+FEDA}' // KAF stringdef l1 '{U+FEDF}' // LAM stringdef l2 '{U+FEE0}' // LAM stringdef l3 '{U+FEDD}' // LAM stringdef l4 '{U+FEDE}' // LAM stringdef m1 '{U+FEE3}' // MEEM stringdef m2 '{U+FEE4}' // MEEM stringdef m3 '{U+FEE1}' // MEEM stringdef m4 '{U+FEE2}' // MEEM stringdef n1 '{U+FEE7}' // NOON stringdef n2 '{U+FEE8}' // NOON stringdef n3 '{U+FEE5}' // NOON stringdef n4 '{U+FEE6}' // NOON stringdef e1 '{U+FEEB}' // HEH stringdef e2 '{U+FEEC}' // HEH stringdef e3 '{U+FEE9}' // HEH stringdef e4 '{U+FEEA}' // HEH stringdef w1 '{U+FEED}' // WAW stringdef w2 '{U+FEEE}' // WAW stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA stringdef y1 '{U+FEF3}' // YEH stringdef y2 '{U+FEF4}' // YEH stringdef y3 '{U+FEF1}' // YEH stringdef y4 '{U+FEF2}' // YEH // Ligatures Lam-Alef stringdef la '{U+FEFB}' // LAM_ALEF stringdef la2 '{U+FEFC}' // LAM_ALEF stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE booleans ( is_noun is_verb is_defined ) routines ( Prefix_Step1 Prefix_Step2 Prefix_Step3a_Noun Prefix_Step3b_Noun Prefix_Step3_Verb Prefix_Step4_Verb Suffix_All_alef_maqsura Suffix_Noun_Step1a Suffix_Noun_Step1b Suffix_Noun_Step2a Suffix_Noun_Step2b Suffix_Noun_Step2c1 Suffix_Noun_Step2c2 Suffix_Noun_Step3 Suffix_Verb_Step1 Suffix_Verb_Step2a Suffix_Verb_Step2b Suffix_Verb_Step2c Normalize_post Normalize_pre Checks1 ) externals ( stem ) groupings ( ) // Normalizations define Normalize_pre as ( do repeat ( ( [substring] among ( '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization '{_}' ( delete ) // 
strip kasheeda // Hindu–Arabic numerals '{0}' ( <- '0') '{1}' ( <- '1') '{2}' ( <- '2') '{3}' ( <- '3') '{4}' ( <- '4') '{5}' ( <- '5') '{6}' ( <- '6') '{7}' ( <- '7') '{8}' ( <- '8') '{9}' ( <- '9') // Shaped forms '{o1}' ( <- '{o}' ) // HAMZA '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA '{a1}' '{a2}' ( <- '{a}' ) // ALEF '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH '{d1}' '{d2}' ( <- '{d}' ) // DAL '{dz1}''{dz2}' ( <- '{dz}' ) // THAL '{r1}' '{r2}'( <- '{r}' ) // REH '{z1}' '{z2}' ( <- '{z}' ) // ZAIN '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH '{w1}' '{w2}' ( <- '{w}' ) // WAW '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH // Ligatures Lam-Alef '{la}' '{la2}' (<- '{l}{a}') '{lao}' '{lao2}' (<- '{l}{ao}') '{lao_}' '{lao_2}' (<- '{l}{ao_}') 
'{la~}' '{la~2}' (<- '{l}{a~}') ) ) or next ) ) define Normalize_post as ( do ( // normalize last hamza backwards ( [substring] among ( '{ao}''{ao_}' '{a~}' ( <- '{o}') '{wo}' ( <- '{o}') '{yo}' ( <- '{o}') ) ) ) do repeat ( ( // normalize other hamza's [substring] among ( '{ao}''{ao_}' '{a~}' ( <- '{a}') '{wo}' ( <- '{w}') '{yo}' ( <- '{y}') ) ) or next ) ) // Checks define Checks1 as ( [substring] among ( '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) ) ) //prefixes define Prefix_Step1 as ( [substring] among ( '{ao}{ao}' ($(len > 3) <- '{ao}' ) '{ao}{a~}' ($(len > 3) <- '{a~}' ) '{ao}{wo}' ($(len > 3) <- '{ao}' ) '{ao}{a}' ($(len > 3) <- '{a}' ) '{ao}{ao_}' ($(len > 3) <- '{ao_}' ) // '{ao}' ($(len > 3) delete) //rare case ) ) define Prefix_Step2 as ( [substring] among ( '{f}' '{w}' ($(len > 3) not '{a}' delete) ) ) define Prefix_Step3a_Noun as ( // it is noun and defined [substring] among ( '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) '{l}{l}' '{a}{l}' ($(len > 4) delete) ) ) define Prefix_Step3b_Noun as ( // probably noun and defined [substring] among ( '{b}{a}' ( ) // exception - not a valid verb prefix so can just succeed here '{b}' ($(len > 3) delete) // '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion '{b}{b}' ($(len > 3) <- '{b}' ) '{k}{k}' ($(len > 3) <- '{k}' ) ) ) define Prefix_Step3_Verb as ( [substring] among ( //'{s}' ($(len > 4) delete)// BUG: cause confusion '{s}{y}' ($(len > 4) <- '{y}' ) '{s}{t}' ($(len > 4) <- '{t}') '{s}{n}' ($(len > 4) <- '{n}') '{s}{ao}' ($(len > 4) <- '{ao}') ) ) define Prefix_Step4_Verb as ( [substring] among ( '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) ) ) // suffixes backwardmode ( define Suffix_Noun_Step1a as ( [substring] among ( '{y}' '{k}' '{e}' ($(len >= 4) delete) '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete) '{k}{m}{a}' '{e}{m}{a}' ($(len >= 
6) delete) ) ) define Suffix_Noun_Step1b as ( [substring] among ( '{n}' ($(len > 5) delete) ) ) define Suffix_Noun_Step2a as ( [substring] among ( '{a}' '{y}' '{w}' ($(len > 4) delete) ) ) define Suffix_Noun_Step2b as ( [substring] among ( '{a}{t}' ($(len >= 5) delete) ) ) define Suffix_Noun_Step2c1 as ( [substring] among ( '{t}' ($(len >= 4) delete) ) ) define Suffix_Noun_Step2c2 as ( // feminine t_ [substring] among ( '{t_}' ($(len >= 4) delete) ) ) define Suffix_Noun_Step3 as ( // ya' nisbiya [substring] among ( '{y}' ($(len >= 3) delete) ) ) define Suffix_Verb_Step1 as ( [substring] among ( '{e}' '{k}' ($(len >= 4) delete) '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete) '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete) ) ) define Suffix_Verb_Step2a as ( [substring] among ( '{t}' ($(len >= 4) delete) '{a}' '{n}' '{y}' ($(len >= 4) delete) '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present '{t}{m}{a}' ($(len >= 6) delete) ) ) define Suffix_Verb_Step2b as ( [substring] among ( '{w}{a}' '{t}{m}' ($(len >= 5) delete) ) ) define Suffix_Verb_Step2c as ( [substring] among ( '{w}' ($(len >= 4) delete) '{t}{m}{w}' ($(len >= 6) delete) ) ) define Suffix_All_alef_maqsura as ( [substring] among ( '{a_}' ( <- '{y}' ) // spell error // '{a_}' ( delete ) // if noun > 3 // '{a_}' ( <- '{a}') // if verb ) ) ) define stem as ( // set initial values set is_noun set is_verb unset is_defined // guess type and properties do Checks1 // normalization pre-stemming do Normalize_pre backwards ( do ( //Suffixes for verbs ( is_verb ( ( (atleast 1 Suffix_Verb_Step1) ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) ) or Suffix_Verb_Step2b or Suffix_Verb_Step2a ) ) //Suffixes for nouns or ( is_noun ( try ( Suffix_Noun_Step2c2 or (not is_defined Suffix_Noun_Step1a ( Suffix_Noun_Step2a or Suffix_Noun_Step2b or Suffix_Noun_Step2c1 or next)) or (Suffix_Noun_Step1b ( Suffix_Noun_Step2a or 
Suffix_Noun_Step2b or Suffix_Noun_Step2c1)) or (not is_defined Suffix_Noun_Step2a) or (Suffix_Noun_Step2b) ) Suffix_Noun_Step3 ) ) // Suffixes for alef maqsura or Suffix_All_alef_maqsura ) ) //Prefixes do ( try Prefix_Step1 try Prefix_Step2 ( Prefix_Step3a_Noun or (is_noun Prefix_Step3b_Noun) or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb) ) ) // normalization post-stemming do Normalize_post ) snowball-3.1.0/algorithms/armenian.sbl000066400000000000000000000162551520373054300200140ustar00rootroot00000000000000stringescapes {} stringdef a '{U+0561}' // 531 stringdef b '{U+0562}' // 532 stringdef g '{U+0563}' // 533 stringdef d '{U+0564}' // 534 stringdef ye '{U+0565}' // 535 stringdef z '{U+0566}' // 536 stringdef e '{U+0567}' // 537 stringdef y '{U+0568}' // 538 stringdef dt '{U+0569}' // 539 stringdef zh '{U+056A}' // 53A stringdef i '{U+056B}' // 53B stringdef l '{U+056C}' // 53C stringdef kh '{U+056D}' // 53D stringdef ts '{U+056E}' // 53E stringdef k '{U+056F}' // 53F stringdef h '{U+0570}' // 540 stringdef dz '{U+0571}' // 541 stringdef gh '{U+0572}' // 542 stringdef djch '{U+0573}' // 543 stringdef m '{U+0574}' // 544 stringdef j '{U+0575}' // 545 stringdef n '{U+0576}' // 546 stringdef sh '{U+0577}' // 547 stringdef vo '{U+0578}' // 548 stringdef ch '{U+0579}' // 549 stringdef p '{U+057A}' // 54A stringdef dj '{U+057B}' // 54B stringdef r '{U+057C}' // 54C stringdef s '{U+057D}' // 54D stringdef v '{U+057E}' // 54E stringdef t '{U+057F}' // 54F stringdef r' '{U+0580}' // 550 stringdef c '{U+0581}' // 551 stringdef u '{U+0582}' // 552 //vjun stringdef bp '{U+0583}' // 553 stringdef q '{U+0584}' // 554 stringdef ev '{U+0587}' stringdef o '{U+0585}' // 555 stringdef f '{U+0586}' // 556 routines ( mark_regions R2 adjective verb noun ending ) externals ( stem ) integers ( pV p2 ) groupings ( v ) define v '{a}{e}{i}{o}{u}{ye}{vo}{y}' define mark_regions as ( $pV = limit $p2 = limit do ( gopast v setmark pV gopast non-v gopast v gopast non-v setmark p2 ) ) 
backwardmode ( define R2 as $p2 <= cursor define adjective as ( [substring] among ( '{b}{a}{r'}' '{p}{ye}{s}' '{vo}{r'}{e}{n}' '{vo}{v}{i}{n}' '{a}{k}{i}' '{l}{a}{j}{n}' '{r'}{vo}{r'}{d}' '{ye}{r'}{vo}{r'}{d}' '{a}{k}{a}{n}' '{a}{l}{i}' '{k}{vo}{t}' '{ye}{k}{ye}{n}' '{vo}{r'}{a}{k}' '{ye}{gh}' '{v}{vo}{u}{n}' '{ye}{r'}{ye}{n}' '{a}{r'}{a}{n}' '{ye}{n}' '{a}{v}{ye}{t}' '{g}{i}{n}' '{i}{v}' '{a}{t}' '{i}{n}' (delete) ) ) define verb as ( [substring] among ( '{vo}{u}{m}' '{v}{vo}{u}{m}' '{a}{l}{vo}{u}' '{ye}{l}{vo}{u}' '{v}{ye}{l}' '{a}{n}{a}{l}' '{ye}{l}{vo}{u}{c}' '{a}{l}{vo}{u}{c}' '{y}{a}{l}' '{y}{ye}{l}' '{a}{l}{vo}{v}' '{ye}{l}{vo}{v}' '{a}{l}{i}{s}' '{ye}{l}{i}{s}' '{ye}{n}{a}{l}' '{a}{c}{n}{a}{l}' '{ye}{c}{n}{ye}{l}' '{c}{n}{ye}{l}' '{n}{ye}{l}' '{a}{t}{ye}{l}' '{vo}{t}{ye}{l}' '{k}{vo}{t}{ye}{l}' '{t}{ye}{l}' '{v}{a}{ts}' '{ye}{c}{v}{ye}{l}' '{a}{c}{v}{ye}{l}' '{ye}{c}{i}{r'}' '{a}{c}{i}{r'}' '{ye}{c}{i}{n}{q}' '{a}{c}{i}{n}{q}' '{v}{ye}{c}{i}{r'}' '{v}{ye}{c}{i}{n}{q}' '{v}{ye}{c}{i}{q}' '{v}{ye}{c}{i}{n}' '{a}{c}{r'}{i}{r'}' '{a}{c}{r'}{ye}{c}' '{a}{c}{r'}{i}{n}{q}' '{a}{c}{r'}{i}{q}' '{a}{c}{r'}{i}{n}' '{ye}{c}{i}{q}' '{a}{c}{i}{q}' '{ye}{c}{i}{n}' '{a}{c}{i}{n}' '{a}{c}{a}{r'}' '{a}{c}{a}{v}' '{a}{c}{a}{n}{q}' '{a}{c}{a}{q}' '{a}{c}{a}{n}' '{v}{ye}{c}{i}' '{a}{c}{r'}{i}' '{ye}{c}{a}{r'}' '{ye}{c}{a}{v}' '{c}{a}{n}{q}' '{c}{a}{q}' '{c}{a}{n}' '{a}{c}{a}' '{a}{c}{i}' '{ye}{c}{a}' '{ch}{ye}{l}' '{ye}{c}{i}' '{a}{r'}' '{a}{v}' '{a}{n}{q}' '{a}{q}' '{a}{n}' '{a}{l}' '{ye}{l}' '{ye}{c}' '{a}{c}' '{v}{ye}' '{a}' (delete) ) ) define noun as ( [substring] among ( '{a}{ts}{vo}' '{a}{n}{a}{k}' '{a}{n}{o}{c}' '{a}{r'}{a}{n}' '{a}{r'}{q}' '{p}{a}{n}' '{s}{t}{a}{n}' '{ye}{gh}{e}{n}' '{ye}{n}{q}' '{i}{k}' '{i}{ch}' '{i}{q}' '{m}{vo}{u}{n}{q}' '{j}{a}{k}' '{j}{vo}{u}{n}' '{vo}{n}{q}' '{vo}{r'}{d}' '{vo}{c}' '{ch}{ye}{q}' '{v}{a}{ts}{q}' '{v}{vo}{r'}' '{a}{v}{vo}{r'}' '{vo}{u}{dt}{j}{vo}{u}{n}' '{vo}{u}{k}' '{vo}{u}{h}{i}' '{vo}{u}{j}{dt}' '{vo}{u}{j}{q}' '{vo}{u}{s}{t}' 
'{vo}{u}{s}' '{c}{i}' '{a}{l}{i}{q}' '{a}{n}{i}{q}' '{i}{l}' '{i}{ch}{q}' '{vo}{u}{n}{q}' '{g}{a}{r'}' '{vo}{u}' '{a}{k}' '{a}{n}' '{q}' (delete) ) ) define ending as ( [substring] R2 among ( '{n}{ye}{r'}{y}' '{n}{ye}{r'}{n}' '{n}{ye}{r'}{i}' '{n}{ye}{r'}{d}' '{ye}{r'}{i}{c}' '{n}{ye}{r'}{i}{c}' '{ye}{r'}{i}' '{ye}{r'}{d}' '{ye}{r'}{n}' '{ye}{r'}{y}' '{n}{ye}{r'}{i}{n}' '{vo}{u}{dt}{j}{a}{n}{n}' '{vo}{u}{dt}{j}{a}{n}{y}' '{vo}{u}{dt}{j}{a}{n}{s}' '{vo}{u}{dt}{j}{a}{n}{d}' '{vo}{u}{dt}{j}{a}{n}' '{ye}{r'}{i}{n}' '{i}{n}' '{s}{a}' '{vo}{dj}' '{i}{c}' '{ye}{r'}{vo}{v}' '{n}{ye}{r'}{vo}{v}' '{ye}{r'}{vo}{u}{m}' '{n}{ye}{r'}{vo}{u}{m}' '{vo}{u}{n}' '{vo}{u}{d}' '{v}{a}{n}{s}' '{v}{a}{n}{y}' '{v}{a}{n}{d}' '{a}{n}{y}' '{a}{n}{d}' '{v}{a}{n}' '{vo}{dj}{y}' '{vo}{dj}{s}' '{vo}{dj}{d}' '{vo}{c}' '{vo}{u}{c}' '{vo}{dj}{i}{c}' '{c}{i}{c}' '{v}{i}{c}' '{v}{i}' '{v}{vo}{v}' '{vo}{v}' '{a}{n}{vo}{v}' '{a}{n}{vo}{u}{m}' '{v}{a}{n}{i}{c}' '{a}{m}{b}' '{a}{n}' '{n}{ye}{r'}' '{ye}{r'}' '{v}{a}' '{y}' '{n}' '{d}' '{c}' '{i}' (delete) ) ) ) define stem as ( do mark_regions backwards setlimit tomark pV for ( do ending do verb do adjective do noun ) ) snowball-3.1.0/algorithms/basque.sbl000066400000000000000000000114621520373054300174750ustar00rootroot00000000000000routines ( aditzak izenak adjetiboak mark_regions RV R2 R1 ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef n~ '{U+00F1}' define v 'aeiou' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) backwardmode ( define RV as $pV <= cursor define R2 as $p2 <= cursor define R1 as $p1 <= cursor define aditzak as ( [substring] among( 'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea' 'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 
'gaitza' 'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza' 'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez' 'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea' 'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena' 'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea' 'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari' 'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu' 'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako' ( RV delete ) 'garri' 'garria' 'tza' (R2 delete) 'atseden' 'arabera' 'baditu' ( ) ) ) define izenak as ( [substring] among( 'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina' 'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea' 'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua' 'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di' 'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa' 'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia' 'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia' 'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua' 'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara' 'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge' 'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua' 'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia' 'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde' 'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea' 'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea' 'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia' 'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa' 'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 
'takoa' 'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila' 'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa' 'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia' 'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena' 'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan' 'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek' 'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara' 'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket' 'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko' 'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera' 'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko' ( RV delete ) 'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza' ( R2 delete ) 'joka' (<- 'jok') 'tzen' 'ten' 'en' 'tatu' (R1 delete) 'trako' (<- 'tra') 'minutuko' (<- 'minutu') 'zehar' 'geldi' 'igaro' 'aurka' ( ) ) ) define adjetiboak as ( [substring] among( 'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria' 'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik' ( RV delete ) 'zlea' (<- 'z') ) ) ) define stem as ( do mark_regions backwards ( repeat aditzak repeat izenak do adjetiboak ) ) snowball-3.1.0/algorithms/catalan.sbl000066400000000000000000000165571520373054300176320ustar00rootroot00000000000000routines ( cleaning mark_regions R1 R2 attached_pronoun standard_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef a` '{U+00E0}' // a-grave stringdef cc '{U+00E7}' // c-cedilla stringdef e' '{U+00E9}' // e-acute stringdef e` '{U+00E8}' // e-grave stringdef i' '{U+00ED}' // i-acute stringdef i` '{U+00EC}' // i-grave stringdef i" '{U+00EF}' // i-diaeresis stringdef o' '{U+00F3}' // o-acute stringdef o` '{U+00F2}' // o-grave stringdef u' '{U+00FA}' // u-acute stringdef u" '{U+00FC}' // u-diaeresis stringdef . 
'{U+00B7}' // - per l aggeminades define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}' define mark_regions as ( $p1 = limit $p2 = limit // defaults do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define cleaning as repeat ( [substring] among( '{a'}' (<- 'a') '{a`}' (<- 'a') '{e'}' (<- 'e') '{e`}' (<- 'e') '{i'}' (<- 'i') '{i`}' (<- 'i') '{o'}' (<- 'o') '{o`}' (<- 'o') '{u'}' (<- 'u') '{u"}' (<- 'u') '{i"}' (<- 'i') '{.}' (<- '.') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among ( '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls' '-ls' '-la' '-les' '-li' 'vos' 'se' 'nos' '-nos' '-us' 'us' '{'}n' '{'}ns' '-n' '-ns' '{'}m' '-me' '-m' '-te' '{'}t' 'li' 'lo' 'los' 'me' 'sela' 'selo' 'selas' 'selos' 'le' 'la' 'las' 'les' 'ens' 'ho' 'hi' (R1 delete) ) ) define standard_suffix as ( [substring] among( 'ar' 'atge' 'formes' 'icte' 'ictes' 'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta' 'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls' 'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius' 'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste' 'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis' '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{cc}a' 'nces' '{o'}' 'dor' 'all' 'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu' '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar' 'itar' 'ables' 'adors' 'idores' 'idors' 'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es' 'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris' 'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament' 'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes' 'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies' '{i"}tat' 
'{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles' 'assa' 'asses' 'assos' 'ent' 'ents' '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin' 'ims' 'ima' 'imes' 'isme' 'ista' 'ismes' 'istes' 'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius' 'oses' 'osos' 'ient' 'otes' 'ots' (R1 delete) 'acions' 'ada' 'ades' (R2 delete) 'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques' (R2 <- 'log') 'ic' 'ica' 'ics' 'iques' (R2 <- 'ic') 'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima' (R1 <- 'c') ) ) define verb_suffix as ( [substring] among( 'ador' 'adora' 'adors' 'adores' 're' 'ie' 'ent' 'ents' 'udes' 'ar{a`}' 'eren' 'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'aria' 'arian' 'arien' 'aries' 'ar{a`}s' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara' 'ar{e'}' 'ar{e'}s' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'er' 'erau' 'erass' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu' 'ia' 'ies' '{i'}em' '{i`}eu' 'ien' 'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats' 'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu' 'essen' 'esses' 'assen' 'asses' 'assim' 'assiu' '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem' '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren' 'ar{i'}em' 'ar{i'}eu' 'areu' 'aren' 'ant' '{i"}m' '{i"}u' '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es' 'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da' 'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its' 'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as' 'ieu' 'ii' 'io' 'i{a`}' 'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' 
'{e`}ssim' '{e`}ssiu' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' 'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien' 'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu' 'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis' 'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin' 'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen' 'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim' '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu' '{i"}ra' '{i"}ren' '{i"}res' '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x' 'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis' 'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s' (R1 delete) 'ando' (R2 delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu' 'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it' (R1 delete) 'iqu' (R1 <- 'ic') ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or verb_suffix ) do residual_suffix ) do cleaning ) snowball-3.1.0/algorithms/czech.sbl000066400000000000000000000157201520373054300173120ustar00rootroot00000000000000routines ( R1 palatalise_e palatalise_i mark_regions possessive_suffix case_suffix ) externals ( stem ) integers ( p1 x ) groupings ( env_ending ev_ending v v_or_syllabic_c ) stringescapes {} stringdef a' '{U+00E1}' // á stringdef cv '{U+010D}' // č stringdef dv '{U+010F}' // ď stringdef e' '{U+00E9}' // é stringdef ev '{U+011B}' // ě stringdef i' '{U+00ED}' // í stringdef nv '{U+0148}' // ň stringdef o' '{U+00F3}' // ó stringdef rv '{U+0159}' // ř stringdef sv '{U+0161}' // š stringdef tv '{U+0165}' // ť stringdef u' 
'{U+00FA}' // ú stringdef uo '{U+016F}' // ů stringdef y' '{U+00FD}' // ý stringdef zv '{U+017E}' // ž define v 'aeiouy{a'}{ev}{e'}{i'}{o'}{u'}{uo}{y'}' // Some consonants in Czech can be syllabic - if these occur between two other // consonants then they act in a vowel-like way and it is helpful to include // them in the definition of R1. // // Some sources also list 'm' and 'n' as syllabic consonants for Czech but they // seem to be much rarer and including them makes no difference to the results // of stemming any words in our sample vocabulary list. Checking on a larger // vocabulary list (also from wikipedia but with a lower cut-off frequency) // all but one of the affected words don't seem to actually be Czech words. define v_or_syllabic_c v + 'lr' // Letters that can occur before -ev. Actual known exceptions include // 'j' (objev, projev, výjev) and 'ř' (ohřev) define ev_ending 'hknrtz' // Letters that can occur before -eň. Actual known exceptions include // 'g' (Irgeň), 'l' (zeleň), 'm' (kameň) and 'ř' (třeň). define env_ending 'bc{cv}dhkprs{sv}tvz{zv}' define mark_regions as ( test (hop 3 setmark x) // Signals f if the input < 3 characters. $p1 = limit do ( // A syllabic consonant must occur between two consonants, or be // preceded by a consonant and at the end of the word. // // Instead of literally testing that, we handle the first character // specially by only checking if it's a vowel; for subsequent // characters we know that the character before is a consonant because // otherwise we'd have stopped already. // // We also don't actually need to check the character after, since // if it's a vowel then that vowel means we'd end up at the same // position after `gopast non-v` anyway, and if it's the end of the // word then there's no non-v after it. 
(v or (next gopast v_or_syllabic_c)) gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 ) ) backwardmode ( define R1 as $p1 <= cursor define palatalise_e as ( [substring] among ( // -c -> -k 'c' (<- 'k') 'nc' // e.g. finance 'avc' // e.g. dravce 'ovc' // e.g. jalovce () '{i'}nc' // e.g. podmínce (<- '{i'}nk') ) ) define palatalise_i as ( [substring] among ( // -c -> -k 'c' (<- 'k') 'nc' // e.g. financí 'avc' // e.g. nástavci 'ovc' // e.g. pískovci () '{i'}nc' // e.g. Gruzínci (<- '{i'}nk') '{cv}t' (<- 'ck') // -št -> -sk '{sv}t' // e.g. čeština (<-'sk') '{a'}{sv}t' // e.g. plášti 'de{sv}t' // e.g. dešti 'i{sv}t' // e.g. bojišti '{i'}{sv}t' // e.g. příští 'le{sv}t' // e.g. kleští 'pou{sv}t' // e.g. poušti, spouští () ) ) define possessive_suffix as ( [substring] R1 among ( 'ov' '{uo}v' (delete) 'in' ( delete try palatalise_i ) ) ) define case_suffix as ( setlimit tomark p1 for ( [substring] ) among ( 'atech' 'at{uo}m' '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi' 'ata' 'aty' 'ama' 'ami' 'ovi' 'at' '{a'}m' 'us' '{uo}m' '{y'}m' 'mi' 'ou' '{e'}ho' '{e'}m' '{e'}mu' 'u' 'y' '{uo}' 'a' 'o' '{a'}' '{e'}' '{y'}' (delete) '{ev}' '{ev}tem' '{ev}mi' '{ev}te' '{ev}ti' '{ev}m' // e.g. koněm ( delete ) 'e' 'ech' 'em' 'emi' ( delete try palatalise_e ) 'ete' 'eti' 'etem' ( // t-stem neuter nouns among ( '{cv}' // e.g. dvojč-etem 'l' // e.g. batol-ete '{rv}' // e.g. zvíř-ete 's' // e.g. pras-ete '{zv}' // e.g. páž-ete (delete) 'e{cv}' // e.g. pečet-i 'tl' // e.g. atlet-i 'es' // e.g. deset-i '' ( // Remove -e, -i, or -em; stem now ends -et so no palatalise // step is needed. <-'et' ) ) ) 'eb' ( // Conflate e.g. skladeb with skladba, skladbě, skladby, etc. test non-v not 't{rv}' // potřeb <-'b' ) 'ec' ( // Conflate e.g. obec with obce, obcemi, obci, obcí, obcích. test non-v delete attach 'c' try palatalise_e ) 'ek' ( // Conflate e.g. článek with článkem, článku, článků, článkům, články. 
test non-v not among ( 'dot' // dotek 'obl' // oblek 'sn' // česnek ) <-'k' ) '{ev}k' ( // Conflate e.g. daněk with daňka, daňkem, daňki, daňkovi, daňků, etc. 'n'] <-'{nv}k' ) 'e{nv}' ( // Conflate e.g. Plzeň with Plzně, Plzni, Plzní. test env_ending <-'n' // -eň -> -n not -ň ) // -eš can also lose the -e- but this seems very uncommon - the only // example I've seen is Aleš (a male given name or diminuitive name) // but we require 3 characters before R1 so this won't be considered // for stemming. // // Also this can decline as Alše or Aleše, etc, and these alternative // declensions mean that just removing the -e- from Aleš would not // really help (especially as the forms which keep the -e- seem // more common, at least based on cs.wikipedia.org) so if we did this // we would need to also remove -e- from Aleše, etc, which seems a lot // of complication for a single word. 'et' ( // Conflate e.g. počet with počte, počtu, počty, etc. among ( 'uc' // e.g. tucet but not dvacet. '{cv}' 'h' 'ok' // e.g. loket but not paket. 'kar' // e.g. karet but not cigaret. ) <-'t' ) 'ev' ( // Conflate e.g. církev with církve, církve, církvemu, církví, etc. ev_ending <-'v' ) '{tv}' '{tv}mi' ( // Conflate e.g. oběť and oběťmi with obětech; hradišť with hradišti. <-'t' ) 'i' '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' '{i'}mu' ( delete try palatalise_i ) ) ) ) define stem as ( mark_regions // Signals f if the input has < 3 characters. backwards ( do case_suffix do possessive_suffix ) ) snowball-3.1.0/algorithms/danish.sbl000066400000000000000000000042411520373054300174600ustar00rootroot00000000000000routines ( mark_regions main_suffix consonant_pair other_suffix undouble ) externals ( stem ) strings ( ch ) integers ( p1 ) groupings ( undouble_c v s_ending ) stringescapes {} /* special characters */ stringdef ae '{U+00E6}' stringdef ao '{U+00E5}' stringdef o/ '{U+00F8}' // Consonants which get undoubled by routine `undouble`. 
define undouble_c 'bdfgklmnprst' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'abcdfghjklmnoprtvyz{ao}{'}' define mark_regions as ( $p1 = limit do ( ( // If there's an apostrophe, start R1 after it to handle // acronym loanwords such as "pc'en" and "ep'en". gopast '{'}' ) or ( gopast v gopast non-v ) setmark p1 ) // Ensure at least 3 characters before R1. test (hop 3 do ($p1 < cursor $p1 = cursor)) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' 'erets' 'et' 'eret' (delete) 's' (s_ending delete) ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'gd' // significant in the call from other_suffix 'dt' 'gt' 'kt' ) ) next] delete ) define other_suffix as ( do ( ['st'] 'ig' delete ) setlimit tomark p1 for ([substring]) among( 'ig' 'lig' 'elig' 'els' (delete do consonant_pair) 'l{o/}st' (<-'l{o/}s') ) ) define undouble as ( setlimit tomark p1 for ([undouble_c] ->ch) ch delete ) ) define stem as ( mark_regions backwards ( do main_suffix do consonant_pair do other_suffix do undouble // Remove trailing apostrophe. 
['{'}'] delete ) ) snowball-3.1.0/algorithms/dutch.sbl000066400000000000000000000203311520373054300173170ustar00rootroot00000000000000// Dutch stemming algorithm developed by Wessel Kraaij and Renée Pohlmann strings ( ch ) integers ( p1 p2 ) booleans ( stemmed GE_removed ) routines ( R1 R2 C V VX lengthen_V Step_1 Step_2 Step_3 Step_4 Step_7 Step_6 Step_1c Lose_prefix Lose_infix measure ) externals ( stem ) groupings ( v v_WX A AEIOU AIOU E I O U ) stringescapes {} /* special characters */ stringdef a` '{U+00E0}' stringdef a' '{U+00E1}' stringdef a^ '{U+00E2}' stringdef a" '{U+00E4}' stringdef e` '{U+00E8}' stringdef e' '{U+00E9}' stringdef e^ '{U+00EA}' stringdef e" '{U+00EB}' stringdef i` '{U+00EC}' stringdef i' '{U+00ED}' stringdef i^ '{U+00EE}' stringdef i" '{U+00EF}' stringdef o` '{U+00F2}' stringdef o' '{U+00F3}' stringdef o^ '{U+00F4}' stringdef o" '{U+00F6}' stringdef u` '{U+00F9}' stringdef u' '{U+00FA}' stringdef u^ '{U+00FB}' stringdef u" '{U+00FC}' define A 'a{a"}{a'}{a`}{a^}' define E 'e{e"}{e'}{e`}{e^}' define I 'i{i"}{i'}{i`}{i^}' define O 'o{o"}{o'}{o`}{o^}' define U 'u{u"}{u'}{u`}{u^}' define AIOU A + I + O + U define AEIOU A + E + I + O + U define v AEIOU + 'y' define v_WX v + 'wx' backwardmode ( define R1 as ($p1 <= cursor) define R2 as ($p2 <= cursor) define V as test (v or 'ij') define VX as test (next v or 'ij') define C as test (not 'ij' non-v) define lengthen_V as do ( non-v_WX [substring] among ( 'a' '{a"}' '{a'}' '{a`}' '{a^}' 'o' '{o"}' '{o'}' '{o`}' '{o^}' 'u' '{u"}' '{u'}' '{u`}' '{u^}' (test (non-AEIOU or atlimit) ->ch insert ch) 'e' '{e'}' '{e`}' '{e^}' (test (non-AEIOU or atlimit not (AIOU or (E atlimit)) not (next AIOU non-AEIOU)) ->ch insert ch) 'e{e"}' (<-'e{e"}e') 'i{e"}' (<-'iee') ) ) define Step_1 as ( [substring] among ( '{'}s' (delete) 's' (R1 not ('t' R1) C delete) 'ies' (R1 <-'ie') 'es' ((test ('ar' R1 C) delete lengthen_V) or (test ('er' R1 C) delete) or (R1 C <-'e')) '{e'}s' (R1 <-'{e'}') 'aus' (R1 V <-'au') 'en' (('hed' 
R1 ] <-'heid') or ('nd' delete) or ('d' R1 C ] delete) or ('i' or 'j' V delete) or (R1 C delete lengthen_V)) 'nde' (<-'nd') ) ) define Step_2 as ( [substring] among ( 'je' (('{'}t' ] delete) or ('et' ] R1 C delete) or ('rnt' ] <-'rn') or ('t' ] R1 VX delete) or ('ink' ] <-'ing') or ('mp' ] <-'m') or ('{'}' ] R1 delete) or (] R1 C delete)) 'ge' (R1 <-'g') 'lijke'(R1 <-'lijk') 'ische'(R1 <-'isch') 'de' (R1 C delete) 'te' (R1 <-'t') 'se' (R1 <-'s') 're' (R1 <-'r') 'le' (R1 delete attach 'l' lengthen_V) 'ene' (R1 C delete attach 'en' lengthen_V) 'ieve' (R1 C <-'ief') ) ) define Step_3 as ( [substring] among ( 'atie' (R1 <-'eer') 'iteit' (R1 delete lengthen_V) 'heid' 'sel' 'ster' (R1 delete) 'rder' (<-'r') 'ing' 'isme' 'erij' (// Exception added to avoid conflating // `schilderij` (painting) and `schild` (shield). ('ild' <- 'er') or (R1 delete lengthen_V)) 'arij' (R1 C <-'aar') 'fie' (R2 delete attach 'f' lengthen_V) 'gie' (R2 delete attach 'g' lengthen_V) 'tst' (R1 C <-'t') 'dst' (R1 C <-'d') ) ) define Step_4 as ( ( [substring] among ( 'ioneel' (R1 <-'ie') 'atief' (R1 <-'eer') 'baar' (R1 delete) 'naar' (R1 V <-'n') 'laar' (R1 V <-'l') 'raar' (R1 V <-'r') 'tant' (R1 <-'teer') 'lijker' 'lijkst' (R1 <-'lijk') 'achtig' 'achtiger' 'achtigst'(R1 delete) 'eriger' 'erigst' 'erig' 'end' (R1 C delete lengthen_V) ) ) or ( [substring] among ( 'iger' 'igst' 'ig' (R1 // Exception added to avoid conflating // `innig` (intimate) and `in` (in). not ('inn' atlimit) C delete lengthen_V) ) ) ) define Step_7 as ( [substring] among ( 'kt' (<-'k') 'ft' (<-'f') 'pt' (<-'p') ) ) define Step_6 as ( [substring] among ( 'bb' (<-'b') 'cc' (<-'c') 'dd' (<-'d') 'ff' (<-'f') 'gg' (<-'g') 'hh' (<-'h') 'jj' (<-'j') 'kk' (<-'k') 'll' (<-'l') 'mm' (<-'m') 'nn' (// Exception added to avoid conflating // `innen` (to collect/cash) and `in` (in). 
not ('i' atlimit) <-'n') 'pp' (<-'p') 'qq' (<-'q') 'rr' (<-'r') 'ss' (<-'s') 'tt' (<-'t') 'vv' (<-'v') 'ww' (<-'w') 'xx' (<-'x') 'zz' (<-'z') 'v' (<-'f') 'z' (<-'s') ) ) define Step_1c as ( [substring] R1 C among ( 'd' (not ('n' R1) // Exception added to avoid conflating // `geïnd` (collected/cashed) and `in` (in). // Instead we conflate `geïnd` with `innen`. ('in' atlimit <-'n') or delete) 't' (not ('h' R1) // Exception added to avoid conflating // `geënt` (grafted) and `en` (and). not ('en' atlimit) delete ) ) ) ) define Lose_prefix as ( ['ge'] test hop 3 test (gopast ('ij' or v) repeat ('ij' or v) not atlimit) // Exceptions added: among ( // Avoid conflating `geeft` and `effen`/`effende`\`geeffende`. 'eft' (false) // Avoid conflating `gevallen`/`geval` and `vallen`. 'val' (false) 'vali' (true) // Avoid conflating `gevaren`/`gevaar` (danger), `gevaarten` (huge // objects) and `varen` (to sail) 'vaa' 'vare' (false) '' (true) ) set GE_removed delete do ( [substring] among ( '{e"}' (<-'e') '{i"}' (<-'i') ) ) ) define Lose_infix as ( next gopast (['ge']) test hop 3 test (gopast ('ij' or v) repeat ('ij' or v) not atlimit) set GE_removed delete do ( [substring] among ( '{e"}' (<-'e') '{i"}' (<-'i') ) ) ) define measure as ( $p1 = limit $p2 = limit do( repeat non-v atleast 1 ('ij' or v) non-v setmark p1 repeat non-v atleast 1 ('ij' or v) non-v setmark p2 ) ) define stem as ( unset stemmed measure backwards ( do (Step_1 set stemmed ) do (Step_2 set stemmed ) do (Step_3 set stemmed ) do (Step_4 set stemmed ) ) unset GE_removed do (Lose_prefix and measure) backwards ( do (GE_removed set stemmed Step_1c) ) unset GE_removed do (Lose_infix and measure) backwards ( do (GE_removed set stemmed Step_1c) ) backwards ( do (Step_7 set stemmed ) do (stemmed Step_6) ) ) snowball-3.1.0/algorithms/dutch_porter.sbl000066400000000000000000000071131520373054300207150ustar00rootroot00000000000000// Dutch stemming algorithm developed by Martin Porter routines ( prelude postlude e_ending 
en_ending mark_regions R1 R2 undouble standard_suffix ) externals ( stem ) booleans ( e_found ) integers ( p1 p2 x ) groupings ( v v_I v_j ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef e" '{U+00EB}' stringdef i" '{U+00EF}' stringdef o" '{U+00F6}' stringdef u" '{U+00FC}' stringdef a' '{U+00E1}' stringdef e' '{U+00E9}' stringdef i' '{U+00ED}' stringdef o' '{U+00F3}' stringdef u' '{U+00FA}' stringdef e` '{U+00E8}' define v 'aeiouy{e`}' define v_I v + 'I' define v_j v + 'j' define prelude as ( test repeat ( [substring] among( '{a"}' '{a'}' (<- 'a') '{e"}' '{e'}' (<- 'e') '{i"}' '{i'}' (<- 'i') '{o"}' '{o'}' (<- 'o') '{u"}' '{u'}' (<- 'u') '' (next) ) ) try(['y'] <- 'Y') repeat ( gopast v try ( // If we see `i` not followed by a vowel then we know it couldn't // match on the next iteration so we can advance past it. // // However if we replace `i` with `I` we do need to check the vowel // after the `i` in the next iteration to match the documented // behaviour, e.g. consider input `iiiii`. This may well not make // a difference for any actual Dutch words though. 
[('i'] do(v <- 'I')) or ('y'] <- 'Y') ) ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'I' (<- 'i') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define undouble as ( test among('kk' 'dd' 'tt') [next] delete ) define e_ending as ( unset e_found ['e'] R1 test non-v delete set e_found undouble ) define en_ending as ( R1 non-v and not 'gem' delete undouble ) define standard_suffix as ( do ( [substring] among( 'heden' ( R1 <- 'heid' ) 'en' 'ene' ( en_ending ) 's' 'se' ( R1 non-v_j delete ) ) ) do e_ending do ( ['heid'] R2 not 'c' delete ['en'] en_ending ) do ( [substring] among( 'end' 'ing' ( R2 delete (['ig'] R2 not 'e' delete) or undouble ) 'ig' ( R2 not 'e' delete ) 'lijk' ( R2 delete e_ending ) 'baar' ( R2 delete ) 'bar' ( R2 e_found delete ) ) ) do ( non-v_I test ( among ('aa' 'ee' 'oo' 'uu') non-v ) [next] delete ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball-3.1.0/algorithms/english.sbl000066400000000000000000000134301520373054300176430ustar00rootroot00000000000000integers ( p1 p2 ) booleans ( Y_found ) routines ( prelude postlude mark_regions shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 exception1 ) externals ( stem ) groupings ( aeo v v_WXY valid_LI ) stringescapes {} define aeo 'aeo' define v 'aeiouy' define v_WXY v + 'wxY' define valid_LI 'cdeghkmnrt' define prelude as ( unset Y_found do ( ['{'}'] delete) do ( ['y'] <-'Y' set Y_found) do repeat(goto (v ['y']) <-'Y' set Y_found) ) define mark_regions as ( $p1 = limit $p2 = limit do( among ( 'gener' // generate/general/generic/generous 'commun' // communication/communism/community 'arsen' // arsenic/arsenal 'past' // past/paste 'univers' // universe/universal/university 'later' // lateral/later 'emerg' // 
// emerge/emergency 'organ' // organ/organic/organize 'inter' // intern/internal/international/internment; interfere; interval // ... extensions possible here ... ) or (gopast v gopast non-v) setmark p1 gopast v gopast non-v setmark p2 ) ) backwardmode ( define shortv as ( ( non-v_WXY v non-v ) or ( non-v v atlimit ) or ( 'past' ) // pasted/pasting ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( try ( [substring] among ( '{'}' '{'}s' '{'}s{'}' (delete) ) ) [substring] among ( 'sses' (<-'ss') 'ied' 'ies' ((hop 2 <-'i') or <-'ie') 's' (next gopast v delete) 'us' 'ss' ) ) define Step_1b as ( [substring] among ( 'eed' 'eedly' ( do ( R1 among ( 'proc' 'exc' 'succ' (atlimit) ) or ( <-'ee' ) ) ) 'ed' 'edly' 'ingly' (false) // Handled below. 'ing' ( // Handle exceptional cases here, rest handled below. among ( // dying->die, lying->lie, tying->tie, vying->vie 'y' (test(non-v atlimit) ] <-'ie') // Leave inning, outing, etc alone. 'inn' 'out' 'cann' 'herr' 'earr' 'even' (atlimit) ) ) '' () ) or ( // Handle 'ed' 'edly' 'ing' 'ingly' test gopast v delete [] test ( substring among( 'at' 'bl' 'iz' (fail(<- 'e')) 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // ignoring double c, h, j, k, q, v, w, and x (not (aeo atlimit)) '' (fail(atmark p1 test shortv <- 'e')) ) ) [next] delete ) ) define Step_1c as ( ['y' or 'Y'] non-v not atlimit <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alism' 'aliti' 'alli' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 'biliti' 'bli' (<-'ble') 'ogist' (<-'og') 'ogi' ('l' <-'og') 'fulli' (<-'ful') 'lessli' (<-'less') 'li' (valid_LI delete) ) ) define Step_3 as ( [substring] R1 among ( 'tional' (<-'tion') 'ational' (<-'ate') 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ful' 'ness' (delete) 'ative' (R2 delete) ) ) define Step_4 as ( 
[substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' ('s' or 't' delete) ) ) define Step_5 as ( [substring] among ( 'e' (R2 or (R1 not shortv) delete) 'l' (R2 'l' delete) ) ) ) define exception1 as ( [substring] atlimit among( /* special changes: */ 'skis' (<-'ski') 'skies' (<-'sky') /* special -LY cases */ 'idly' (<-'idl') 'gently' (<-'gentl') 'ugly' (<-'ugli') 'early' (<-'earli') 'only' (<-'onli') 'singly' (<-'singl') // ... extensions possible here ... /* invariant forms: */ 'sky' 'news' 'howe' 'atlas' 'cosmos' 'bias' 'andes' // not plural forms // ... extensions possible here ... ) ) define postlude as (Y_found repeat(goto (['Y']) <-'y')) define stem as ( exception1 or not hop 3 or ( do prelude do mark_regions backwards ( do Step_1a do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do Step_5 ) do postlude ) ) snowball-3.1.0/algorithms/esperanto.sbl000066400000000000000000000067571520373054300202300ustar00rootroot00000000000000booleans ( foreign ) routines ( canonical_form correlative final_apostrophe initial_apostrophe long_word merged_numeral pronoun standard_suffix ujn_suffix uninflected ) externals ( stem ) groupings ( vowel aou digit ) define vowel 'aeiou' define aou 'aou' define digit '0123456789' stringescapes {} stringdef c^ '{U+0109}' stringdef g^ '{U+011D}' stringdef h^ '{U+0125}' stringdef j^ '{U+0135}' stringdef s^ '{U+015D}' stringdef u+ '{U+016D}' stringdef a' '{U+00E1}' stringdef e' '{U+00E9}' stringdef i' '{U+00ED}' stringdef o' '{U+00F3}' stringdef u' '{U+00FA}' define canonical_form as ( unset foreign repeat ( [substring] among( 'cx' (<- '{c^}') 'gx' (<- '{g^}') 'hx' (<- '{h^}') 'jx' (<- '{j^}') 'sx' (<- '{s^}') 'ux' (<- '{u+}') '{a'}' (<- 'a' set foreign) '{e'}' (<- 'e' set foreign) '{i'}' (<- 'i' set foreign) '{o'}' (<- 'o' set foreign) '{u'}' (<- 'u' set foreign) 'q' 'w' 'x' 'y' (set foreign) '-' (unset foreign) '' (next) ) ) not foreign ) define 
initial_apostrophe as ( ['{'}'] 'st' among('as' 'i' 'is' 'os' 'u' 'us') atlimit <- 'e' ) backwardmode ( define pronoun as ( [try 'n'] among( 'ci' 'gi' '{g^}i' 'hi' 'ili' 'i{s^}i' 'ivi' 'li' 'mal{s^}i' 'mi' 'ni' 'oni' 'ri' 'si' '{s^}i' '{s^}li' 'vi' ) (atlimit or '-') delete ) define final_apostrophe as ( ['{'}'] ('l' atlimit <- 'a') or ('un' atlimit <- 'u') or ( among( 'adi' 'almen' 'amb' 'ank' 'ankor' 'anstat' 'anta{u+}hier' 'apen' 'bald' '{c^}irk' 'hier' 'hodi' 'kontr' 'kvaz' 'malbald' 'malgr' 'morg' 'postmorg' 'presk' 'tut{c^}irk' ) (atlimit or '-') <- 'a{u+}' ) or (<- 'o') ) define ujn_suffix as ( [try 'n' try 'j'] among('aliu' 'unu') (atlimit or '-') delete ) define uninflected as ( among( 'aha' 'amen' 'dirlididi' 'disde' 'ehe' 'ekde' 'elde' 'haha' 'haleluja' 'hola' 'hosana' 'hura' '{h^}a{h^}a' 'mal{c^}i' 'malkaj' 'malpli' 'maltra' 'maltre' 'maltro' 'minus' 'muu' 'oho' 'tamen' 'uhu' ) (atlimit or '-') ) define merged_numeral as ( among('du' 'tri' 'unu') among('cent' 'dek') ) define correlative as ( [] // Ignore -al, -am, etc. since they can't be confused with suffixes. test ( ((try 'n'] 'e') or (try 'n' try 'j'] aou)) 'i' try among('{c^}' 'k' 'kelk' 'mult' 'nen' 'samt' 't') (atlimit or '-') ) delete ) define long_word as ( loop 2 gopast vowel or (gopast '-' next) or gopast digit ) define standard_suffix as ( [ among( 'a' 'aj' 'ajn' 'an' 'e' 'en' 'i' 'as' 'is' 'os' 'u' 'us' 'o' 'oj' 'ojn' 'on' () 'j' 'jn' 'n' (test ('-' or digit)) ) try '-' ] delete ) ) define stem as ( test canonical_form do initial_apostrophe backwards ( not pronoun do final_apostrophe not correlative not uninflected not merged_numeral not ujn_suffix test long_word standard_suffix ) ) snowball-3.1.0/algorithms/estonian.sbl000066400000000000000000000253571520373054300200450ustar00rootroot00000000000000/* Estonian stemmer Made by Linda Freienthal in January 2019. 
*/ routines ( mark_regions LONGV special_noun_endings case_ending emphasis plural_three_first_cases undouble_kpt i_plural degrees substantive verb_exceptions verb nu ) stringescapes {} stringdef a" '{U+00E4}' //a-umlaut ä stringdef o" '{U+00F6}' //o-umlaut ö stringdef o~ '{U+00F5}' //o with tilde õ stringdef u" '{U+00FC}' //u-umlaut ü stringdef sv '{U+0161}' //s-caron š stringdef zv '{U+017E}' //z-caron ž externals ( stem ) integers ( p1 ) groupings ( V1 RV KI GI) define V1 'aeiou{o~}{a"}{o"}{u"}' define RV 'aeiuo{'}' define KI 'kptgbdshf{sv}z{zv}' define GI 'cjlmnqrvwxaeiou{o~}{a"}{o"}{u"}' define mark_regions as ( $p1 = limit ( hop 2 gopast '{'}' ) or ( gopast V1 gopast non-V1 ) setmark p1 ) backwardmode ( define emphasis as ( setlimit tomark p1 for ([substring]) test hop 4 //kingi -> kingi among( 'gi' ((GI and not LONGV) delete) //jookse-me-gi, bioloogi -> bioloogi 'ki' (KI delete) //kookki -> kook ) ) // Signals t if a replacement was made; f otherwise. define verb as ( setlimit tomark p1 for ([substring]) among( 'nuksin' 'nuksime' 'nuksid' 'nuksite' (delete) //seleta-nuksite 'ksin' 'ksid' 'ksime' 'ksite' (delete) //personal conditional: rõõmusta-ksin 'mata' (delete) 'takse' 'dakse' (delete) //impersonal: laul-dakse, luba-takse 'taks' 'daks' (delete) //impersonal conditional: laul-daks, saade-taks 'akse' (<-'a') //impersonal: tulla-kse, süüa-kse (-> söö), teha-kse (-> tegi), püüta-kse, leita-kse 'sime' (delete) //pl1pst: saat-sime 'site' (delete) //pl2pst: saat-site 'sin' (delete) //sg1pst: laul-sin, saat-sin 'me' (V1 delete) //pl1prs: laula-me, tule-me 'da' (V1 delete) //da-infinitive: luba-da 'n' (V1 delete) //sg1prs: kirjuta-n 'b' (V1 delete) //sg3prs: laula-b ) ) define LONGV as among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}' '{u"}{u"}' '{o~}{o~}') define i_plural as ( setlimit tomark p1 for ([substring]) among( 'i' (RV) //raama-tu-i, lapsiku-i ) delete ) define special_noun_endings as ( setlimit tomark p1 for ([substring]) among( 'lasse' (<- 'lase') 
//teadlasse -> teadlase 'last' (<- 'lase') //teadlast -> teadlase 'lane' (<- 'lase') //teadlane -> teadlase 'lasi'(<- 'lase') //teadlasi -> teadlase 'misse' (<- 'mise') //tegemisse -> tegemise 'mist' (<- 'mise') //kasutamist -> kasutamise 'mine' (<- 'mise') //tegemine -> tegemise 'misi' (<- 'mise') //kasutamisi -> kasutamise 'lisse' (<- 'lise') //rohelisse -> rohelise 'list' (<- 'lise') //tavalist -> tavalise 'line' (<- 'lise') //roheline -> rohelise 'lisi' (<- 'lise') //tavalisi -> tavalise ) ) define case_ending as ( setlimit tomark p1 for ([substring]) among( 'sse' (RV or LONGV) //illative: saapa-sse 'st' (RV or LONGV) //elative: saapa-st and kapsas-t 'le' (RV or LONGV) //allative: raama-tu-le 'lt' (RV or LONGV) //ablative: raama-tu-lt 'ga' (RV or LONGV) //comitative: õpetaja-ga 'ks' (RV or LONGV) //translative: õpetaja-ks 'ta' (RV or LONGV) //abessive and da-infinitive: õpetaja-ta and hüpa-ta 't' (test hop 4) //partitive: raamatu-t 's' (RV or LONGV) //inessive and sg3pst: raama-tu-s and sõiti-s 'l' (RV or LONGV) //adessive: raama-tu-l and kapsa-l. 
) delete ) define plural_three_first_cases as ( setlimit tomark p1 for ([substring]) among( 'ikkude' (<-'iku') //plural genitive: õnnelikkude -> õnneliku 'ikke' (<-'iku') //plural partitive: rahulikke -> rahuliku 'ike' (<-'iku') //plural genitive: ohtlike -> ohtliku 'sid' (not LONGV delete) //plural partitive and sg2pst and pl3pst: auto-sid and laul-sid (excludes plural nominative with words like gaasid, roosid) // plural genitive and pl2: ministri-te, oluliste -> olulise and saada-te, laula-te; // also torte -> tort (if not in compound word) and kokkuvõtte -> kokkuvõte and roheliste -> rohelise, tegemiste -> tegemise, teadlaste -> teadlase 'te' ( (test hop 4 among ( 'mis' 'las' 'lis' (<- 'e') 't' () '' (delete) ) ) or <- 't' ) 'de' ((RV or LONGV) delete) //plural genitive: lauda-de 'd' ((RV or LONGV) delete) //plural nominative: voodi-d, rattai-d (rata), lapsiku-i-d ) ) define nu as ( setlimit tomark p1 for ([substring]) among( 'nu' //haka-nu(-te-ga) 'tu' //luba-tu(-d) 'du' //laul-du(-te-st) 'va' //laul-va(-te-le) ) delete ) define undouble_kpt as ( // undouble '-C1C1V' where C1 is k, p or t: // mõtte(-le) -> mõte, hakka(-n) -> haka // // We only undouble if the vowel is in R1 to avoid modifying short // non-words (mostly to avoid modifying acronyms/initialisms such // as "PPE"). 
V1 $(p1 <= cursor) [substring] among( 'kk' (<- 'k') 'pp' (<- 'p') 'tt' (<- 't') ) ) define degrees as ( setlimit tomark p1 for ([substring]) among( 'mai' (RV delete) //heleda-mai(-le) 'ma' (delete) //tuge-va-ma(-le) and ma-infinitive: sõit-ma 'm' (RV delete) //kauge-i-m, rõõmsa-m ) ) define substantive as ( do special_noun_endings do case_ending do plural_three_first_cases do degrees do i_plural do nu ) ) define verb_exceptions as ( [substring] atlimit among( 'joon' 'jood' 'joob' 'joote' 'joome' 'joovad' (<-'joo') 'j{o~}in' 'j{o~}id' 'j{o~}i' 'j{o~}ime' 'j{o~}ite' (<-'joo') 'joomata' 'juuakse' 'joodakse' 'juua' 'jooma' (<- 'joo') 'saan' 'saad' 'saab' 'saate' 'saame' 'saavad' (<-'saa') 'saaksin' 'saaksid' 'saaks' 'saaksite' 'saaksime' (<-'saa') 'sain' 'said' 'sai' 'saite' 'saime' (<-'saa') 'saamata' 'saadakse' 'saadi' 'saama' 'saada' (<-'saa') 'viin' 'viid' 'viib' 'viite' 'viime' 'viivad' (<-'viima') 'viiksin' 'viiksid' 'viiks' 'viiksite' 'viiksime' (<-'viima') 'viisin' 'viisite' 'viisime' (<-'viima') 'viimata' 'viiakse' 'viidi' 'viima' 'viia' (<-'viima') 'keen' 'keeb' 'keed' 'kees' 'keeme' 'keete' 'keevad' (<-'keesi') 'keeksin' 'keeks' 'keeksid' 'keeksime' 'keeksite' (<-'keesi') 'keemata' 'keema' 'keeta' 'keedakse' (<-'keesi') 'l{o"}{o"}n' 'l{o"}{o"}d' 'l{o"}{o"}b' 'l{o"}{o"}me' 'l{o"}{o"}te' 'l{o"}{o"}vad' (<-'l{o"}{o"}') 'l{o"}{o"}ksin' 'l{o"}{o"}ksid' 'l{o"}{o"}ks' 'l{o"}{o"}ksime' 'l{o"}{o"}ksite' (<-'l{o"}{o"}') 'l{o"}{o"}mata' 'l{u"}{u"}akse' 'l{o"}{o"}dakse' 'l{o"}{o"}di' 'l{o"}{o"}ma' 'l{u"}{u"}a' (<-'l{o"}{o"}') // Both looma and lööma have these same past tense forms 'l{o~}in' 'l{o~}id' 'l{o~}i' 'l{o~}ime' 'l{o~}ite' (<-'l{o~}i') 'loon' 'lood' 'loob' 'loome' 'loote' 'loovad' (<-'loo') 'looksin' 'looksid' 'looks' 'looksime' 'looksite' (<-'loo') 'loomata' 'luuakse' 'loodi' 'luua' 'looma' (<-'loo') 'k{a"}in' 'k{a"}ib' 'k{a"}id' 'k{a"}is' 'k{a"}ime' 'k{a"}ite' 'k{a"}ivad' (<-'k{a"}isi') 'k{a"}iksin' 'k{a"}iks' 'k{a"}iksid' 'k{a"}iksime' 'k{a"}iksite' 
(<-'k{a"}isi') 'k{a"}imata' 'k{a"}iakse' 'k{a"}idi' 'k{a"}ia' 'k{a"}ima' (<-'k{a"}isi') 's{o"}{o"}n' 's{o"}{o"}b' 's{o"}{o"}d' 's{o"}{o"}me' 's{o"}{o"}te' 's{o"}{o"}vad' (<-'s{o"}{o"}') 's{o"}{o"}ksin' 's{o"}{o"}ks' 's{o"}{o"}ksid' 's{o"}{o"}ksime' 's{o"}{o"}ksite' (<-'s{o"}{o"}') 's{o~}in' 's{o~}i' 's{o~}id' 's{o~}ime' 's{o~}ite' (<-'s{o"}{o"}') 's{o"}{o"}mata' 's{u"}{u"}akse' 's{o"}{o"}dakse' 's{o"}{o"}di' 's{o"}{o"}ma' 's{u"}{u"}a' (<-'s{o"}{o"}') 'toon' 'tood' 'toob' 'toote' 'toome' 'toovad' (<-'too') 'tooksin' 'tooksid' 'tooks' 'tooksite' 'tooksime' (<-'too') 't{o~}in' 't{o~}id' 't{o~}i' 't{o~}ime' 't{o~}ite' (<-'too') 'toomata' 'tuuakse' 'toodi' 'tooma' 'tuua' (<-'too') 'v{o~}in' 'v{o~}id' 'v{o~}ib' 'v{o~}ime' 'v{o~}is' 'v{o~}ite' 'v{o~}ivad' (<-'v{o~}isi') 'v{o~}iksin' 'v{o~}iksid' 'v{o~}iks' 'v{o~}iksime' 'v{o~}iksite' (<-'v{o~}isi') 'v{o~}imata' 'v{o~}idakse' 'v{o~}idi' 'v{o~}ida' 'v{o~}ima' (<-'v{o~}isi') 'j{a"}{a"}n' 'j{a"}{a"}d' 'j{a"}{a"}b' 'j{a"}{a"}me' 'j{a"}{a"}te' 'j{a"}{a"}vad' (<-'j{a"}{a"}ma') 'j{a"}{a"}ksin' 'j{a"}{a"}ksid' 'j{a"}{a"}ks' 'j{a"}{a"}ksime' 'j{a"}{a"}ksite' (<-'j{a"}{a"}ma') 'j{a"}ime' 'j{a"}ite' 'j{a"}in' 'j{a"}id' 'j{a"}i' (<-'j{a"}{a"}ma') 'j{a"}{a"}mata' 'j{a"}{a"}dakse' 'j{a"}{a"}da' 'j{a"}{a"}ma' 'j{a"}{a"}di' (<-'j{a"}{a"}ma') 'm{u"}{u"}n' 'm{u"}{u"}d' 'm{u"}{u"}b' 'm{u"}{u"}s' 'm{u"}{u"}me' 'm{u"}{u"}te' 'm{u"}{u"}vad' (<-'m{u"}{u"}si') 'm{u"}{u"}ksin' 'm{u"}{u"}ksid' 'm{u"}{u"}ks' 'm{u"}{u"}ksime' 'm{u"}{u"}ksite' (<-'m{u"}{u"}si') 'm{u"}{u"}mata' 'm{u"}{u"}akse' 'm{u"}{u"}di' 'm{u"}{u"}a' 'm{u"}{u"}ma' (<-'m{u"}{u"}si') 'loeb' 'loen' 'loed' 'loeme' 'loete' 'loevad' (<- 'luge') 'loeks' 'loeksin' 'loeksid' 'loeksime' 'loeksite' (<- 'luge') 'p{o~}en' 'p{o~}eb' 'p{o~}ed' 'p{o~}eme' 'p{o~}ete' 'p{o~}evad' (<- 'p{o~}de') 'p{o~}eksin' 'p{o~}eks' 'p{o~}eksid' 'p{o~}eksime' 'p{o~}eksite' (<- 'p{o~}de') 'laon' 'laob' 'laod' 'laome' 'laote' 'laovad' (<- 'ladu') 'laoksin' 'laoks' 'laoksid' 'laoksime' 'laoksite' (<- 'ladu') 'teeksin' 
'teeks' 'teeksid' 'teeksime' 'teeksite' (<- 'tegi') 'teen' 'teeb' 'teed' 'teeme' 'teete' 'teevad' (<- 'tegi') 'tegemata' 'tehakse' 'tehti' 'tegema' 'teha' (<-'tegi') 'n{a"}en' 'n{a"}eb' 'n{a"}ed' 'n{a"}eme' 'n{a"}ete' 'n{a"}evad' (<-'n{a"}gi') 'n{a"}eksin' 'n{a"}eks' 'n{a"}eksid' 'n{a"}eksime' 'n{a"}eksite' (<-'n{a"}gi') 'n{a"}gemata' 'n{a"}hakse' 'n{a"}hti' 'n{a"}ha' 'n{a"}gema' (<-'n{a"}gi') ) ) define stem as ( not verb_exceptions // p1 isn't used by verb_exceptions do mark_regions backwards ( do emphasis do ( verb or substantive ) do undouble_kpt ['{'}'] delete ) ) snowball-3.1.0/algorithms/finnish.sbl000066400000000000000000000130111520373054300176430ustar00rootroot00000000000000 /* Finnish stemmer. Numbers in square brackets refer to the sections in Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999 ISBN 0-415-20705-3 */ routines ( mark_regions R2 particle_etc possessive LV VI A E I O U A_ O_ case_ending i_plural t_plural other_endings tidy ) externals ( stem ) integers ( p1 p2 ) strings ( x ) booleans ( ending_removed ) groupings ( AEI C v particle_end ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef o" '{U+00F6}' stringdef o/ '{U+00F8}' define AEI 'a{a"}ei' define C 'bcdfghjklmnpqrstvwxz' define v 'aeiouy{a"}{o"}' define particle_end v + 'nt' define mark_regions as ( $p1 = limit $p2 = limit gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) backwardmode ( define R2 as $p2 <= cursor define particle_etc as ( setlimit tomark p1 for ([substring]) among( 'kin' 'kaan' 'k{a"}{a"}n' 'ko' 'k{o"}' 'han' 'h{a"}n' 'pa' 'p{a"}' // Particles [91] (particle_end) 'sti' // Adverb [87] (R2) ) delete ) define possessive as ( // [36] setlimit tomark p1 for ([substring]) among( 'si' (not 'k' delete) // take 'ksi' as the Comitative case 'ni' (delete ['kse'] <- 'ksi') // kseni = ksi + ni 'nsa' 'ns{a"}' 'mme' 'nne' (delete) /* Now for Vn possessives after case endings: [36] */ 'an' (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') 
delete) '{a"}n' (among('t{a"}' 'ss{a"}' 'st{a"}' 'll{a"}' 'lt{a"}' 'n{a"}') delete) 'en' (among('lle' 'ine') delete) ) ) define LV as among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') define VI as among('ai' 'ei' 'ii' 'oi' 'ui' '{a"}i' '{o"}i' '{'}') define A as ('a' or '{'}') define E as ('e' or '{'}') define I as ('i' or '{'}') define O as ('o' or '{'}') define U as ('u' or '{'}') define A_ as ('{a"}' or '{'}') // -ø-hön seen with Norwegian place names, e.g. Bodøhön define O_ as ('{o"}' or '{o/}' or '{'}') define case_ending as ( setlimit tomark p1 for ([substring]) among( 'h{a"}n' A_ //-. 'h{o"}n' O_ // | 'han' A // | 'hen' E // | 'hin' I // Illative [43] 'hon' O // | 'hun' U // | 'siin' VI // | 'seen' LV //-' 'den' VI 'tten' VI // Genitive plurals [34] () 'n' // Genitive or Illative ( try ( LV // Illative or 'ie' // Genitive and next ] ) /* otherwise Genitive */ ) 'a' '{a"}' //-. (v C) // | 'tta' 'tt{a"}' // Partitive [32] ('e') // | 'ta' 't{a"}' //-' 'ssa' 'ss{a"}' // Inessive [41] 'sta' 'st{a"}' // Elative [42] 'lla' 'll{a"}' // Adessive [44] 'lta' 'lt{a"}' // Ablative [51] 'lle' // Allative [46] 'na' 'n{a"}' // Essive [49] 'ksi' // Translative[50] 'ine' // Comitative [51] /* Abessive and Instructive are too rare for inclusion [51] */ ) delete set ending_removed ) define other_endings as ( setlimit tomark p2 for ([substring]) among( 'mpi' 'mpa' 'mp{a"}' 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] (not 'po') //-improves things 'impi' 'impa' 'imp{a"}' 'immi' 'imma' 'imm{a"}' // Superlative forms [86] 'eja' 'ej{a"}' // indicates agent [93.1B] ) delete ) define i_plural as ( // [26] setlimit tomark p1 for ([substring]) among( 'i' 'j' ) delete ) define t_plural as ( // [26] setlimit tomark p1 for ( ['t'] test v delete ) setlimit tomark p2 for ([substring]) among( 'mma' (not 'po') //-mmat endings 'imma' //-immat endings ) delete ) define tidy as ( setlimit tomark p1 for ( do ( LV and ([next] delete ) ) // undouble vowel do ( [AEI] C delete ) // remove 
trailing a, a", e, i do ( ['j'] 'o' or 'u' delete ) do ( ['o'] 'j' delete ) ) do (goto non-v [C] -> x x delete) // undouble consonant ['{'}'] delete ) ) define stem as ( do mark_regions unset ending_removed backwards ( do particle_etc do possessive do case_ending do other_endings (ending_removed do i_plural) or do t_plural do tidy ) ) snowball-3.1.0/algorithms/french.sbl000066400000000000000000000157101520373054300174620ustar00rootroot00000000000000routines ( elisions prelude postlude mark_regions RV R1 R2 standard_suffix i_verb_suffix verb_suffix residual_suffix un_double un_accent ) externals ( stem ) integers ( pV p1 p2 ) groupings ( elision_char v keep_with_s oux_ending ) stringescapes {} /* special characters */ stringdef a^ '{U+00E2}' // a-circumflex stringdef a` '{U+00E0}' // a-grave stringdef cc '{U+00E7}' // c-cedilla stringdef e" '{U+00EB}' // e-diaeresis (rare) stringdef e' '{U+00E9}' // e-acute stringdef e^ '{U+00EA}' // e-circumflex stringdef e` '{U+00E8}' // e-grave stringdef i" '{U+00EF}' // i-diaeresis stringdef i^ '{U+00EE}' // i-circumflex stringdef o^ '{U+00F4}' // o-circumflex stringdef u^ '{U+00FB}' // u-circumflex stringdef u` '{U+00F9}' // u-grave define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' // Replace -oux with -ou if preceded by one of these letters. 
define oux_ending 'bhjlnp' // Single character elisions define elision_char 'cdjlmnst' define elisions as ( [ (elision_char or 'qu') '{'}' ] not atlimit delete ) define prelude as repeat goto ( ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') or ('y' ] <- 'Y') ) or ( [ '{e"}' ] <- 'He' ) or ( [ '{i"}' ] <- 'Hi' ) or ( ['y'] v <- 'Y' ) or ( 'q' ['u'] <- 'U' ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v v next ) or among ( // Exception list: 'par' // paris, parie, pari 'col' // colis 'tap' // tapis () 'ni' (v) // niais/nierais/nié/niâmes/nièrent // extensions possible here ) or ( next gopast v ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') 'Y' (<- 'y') 'He' (<- '{e"}') 'Hi' (<- '{i"}') 'H' (delete) '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' 'ances' 'iqUes' 'ismes' 'ables' 'istes' ( R2 delete ) 'atrice' 'ateur' 'ation' 'atrices' 'ateurs' 'ations' ( R2 delete try ( ['ic'] (R2 delete) or <-'iqU' ) ) 'logie' 'logies' ( R2 <- 'log' ) 'usion' 'ution' 'usions' 'utions' ( R2 <- 'u' ) 'ence' 'ences' ( R2 <- 'ent' ) 'ement' 'ements' ( RV delete try ( [substring] among( 'iv' (R2 delete ['at'] R2 delete) 'eus' ((R2 delete) or (R1<-'eux')) 'abl' 'iqU' (R2 delete) 'i{e`}r' 'I{e`}r' (RV <-'i') ) ) ) 'it{e'}' 'it{e'}s' ( R2 delete try ( [substring] among( 'abil' ((R2 delete) or <-'abl') 'ic' ((R2 delete) or <-'iqU') 'iv' (R2 delete) ) ) ) 'if' 'ive' 'ifs' 'ives' ( R2 delete try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) ) 'eaux' (<- 'eau') 'aux' (R1 <- 'al') 'oux' (oux_ending <- 'ou') 'euse' 'euses'((R2 delete) or (R1<-'eux')) 'issement' 'issements'(R1 non-v delete) // verbal // fail(...) below forces entry to verb_suffix. 
-ment typically // follows the p.p., e.g 'confus{e'}ment'. 'amment' (RV fail(<- 'ant')) 'emment' (RV fail(<- 'ent')) 'ment' 'ments' (test(v RV) fail(delete)) // v is e,i,u,{e'},I or U ) ) define i_verb_suffix as setlimit tomark pV for ( [substring] among ( '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' 'issez' 'issiez' 'issions' 'issons' 'it' (not 'H' non-v delete) ) ) define verb_suffix as ( setlimit tomark pV for ([substring]) among ( 'ions' (R2 delete) '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' 'erons' 'eront' 'ez' 'iez' // 'ons' //-best omitted (delete) '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ait' 'ant' 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' 'assions' ( try('e' RV]) delete ) 'ais' 'aise' 'aises' ( not among ( 'al' // balais, calais, galais, malais, palais, valais (next atlimit) 'auv' // mauvais '{e'}pl' // déplais () ) delete ) 'eais' (delete) ) ) define keep_with_s 'aiou{e`}s' define residual_suffix as ( try(['s'] test ('Hi' or non-keep_with_s) delete) setlimit tomark pV for ( [substring] among( 'ion' (R2 's' or 't' delete) 'ier' 'i{e`}re' 'Ier' 'I{e`}re' (<-'i') 'e' (delete) ) ) ) define un_double as ( test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete ) define un_accent as ( atleast 1 non-v [ '{e'}' or '{e`}' ] <-'e' ) ) define stem as ( do elisions do prelude do mark_regions backwards ( do ( ( ( standard_suffix or i_verb_suffix or verb_suffix ) and try( [ ('Y' ] <- 'i' ) or ('{cc}'] <- 'c' ) ) ) or residual_suffix ) // try(['ent'] RV delete) // is best omitted do un_double do un_accent ) do postlude ) snowball-3.1.0/algorithms/german.sbl000066400000000000000000000072111520373054300174630ustar00rootroot00000000000000routines ( prelude postlude mark_regions R1 R2 
standard_suffix ) externals ( stem ) integers ( p1 p2 x ) groupings ( v et_ending s_ending st_ending ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef o" '{U+00F6}' stringdef u" '{U+00FC}' stringdef ss '{U+00DF}' define v 'aeiouy{a"}{o"}{u"}' define et_ending 'dfgklmnrstUz{a"}' define s_ending 'bdfghklmnrt' define st_ending s_ending - 'r' define prelude as ( test repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) repeat ( [substring] among( '{ss}' (<- 'ss') 'ae' (<- '{a"}') 'oe' (<- '{o"}') 'ue' (<- '{u"}') 'qu' () '' (next) ) ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'U' (<- 'u') '{a"}' (<- 'a') '{o"}' (<- 'o') '{u"}' (<- 'u') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( do ( [substring] R1 among( 'em' ( not 'syst' // don't remove -em from words ending -system delete ) 'ern' 'er' 'erin' 'erinnen' // conflate female versions of nouns ( delete ) 'e' 'en' 'es' ( delete try (['s'] 'nis' delete) ) 's' ( s_ending delete ) 'ln' 'lns' ( <- 'l' ) ) ) do ( [substring] R1 among( 'en' 'er' 'est' ( delete ) 'st' ( st_ending hop 3 delete ) 'et' ( test et_ending not among ( 'geordn' // Still conflate untergeordnet/untergeordnetere, etc. 'intern' // Don't conflate Internet and internes. 'plan' // Don't conflate Plan and Planet. 'tick' // Don't conflate Tick and Ticket. 'tr' // Still conflate Vertreter/Vertretung, etc. 
) delete ) ) ) do ( [substring] R2 among( 'end' 'ung' ( delete try (['ig'] not 'e' R2 delete) ) 'ig' 'ik' 'isch' ( not 'e' delete ) 'lich' 'heit' ( delete try ( ['er' or 'en'] R1 delete ) ) 'keit' ( delete try ( [substring] R2 among( 'lich' 'ig' ( delete ) ) ) ) ) ) do ( [substring] among( '{'}s' '{'}sch' '{'}' (next not atlimit delete) ) ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball-3.1.0/algorithms/greek.sbl000066400000000000000000000646231520373054300173210ustar00rootroot00000000000000// A stemmer for Modern Greek language, based on: // // Ntais, Georgios. Development of a Stemmer for the Greek // Language. Diss. Royal Institute of Technology, 2006. // https://sais.se/mthprize/2007/ntais2007.pdf // // Saroukos, Spyridon. Enhancing a Greek language stemmer. // University of Tampere, 2008. // https://trepo.tuni.fi/bitstream/handle/10024/80480/gradu03463.pdf stringescapes {} stringdef a '{U+03B1}' // alpha stringdef v '{U+03B2}' // beta stringdef g '{U+03B3}' // gamma stringdef d '{U+03B4}' // delta stringdef e '{U+03B5}' // epsilon stringdef z '{U+03B6}' // zeta stringdef i '{U+03B7}' // eta stringdef th '{U+03B8}' // theta stringdef y '{U+03B9}' // iota stringdef k '{U+03BA}' // kappa stringdef l '{U+03BB}' // lambda stringdef m '{U+03BC}' // mu stringdef n '{U+03BD}' // nu stringdef x '{U+03BE}' // xi stringdef o '{U+03BF}' // omicron stringdef p '{U+03C0}' // pi stringdef r '{U+03C1}' // rho stringdef ss '{U+03C2}' // sigma final stringdef s '{U+03C3}' // sigma stringdef t '{U+03C4}' // tau stringdef u '{U+03C5}' // upsilon stringdef f '{U+03C6}' // phi stringdef ch '{U+03C7}' // chi stringdef ps '{U+03C8}' // psi stringdef oo '{U+03C9}' // omega stringdef A '{U+0391}' // Alpha stringdef V '{U+0392}' // Beta stringdef G '{U+0393}' // Gamma stringdef D '{U+0394}' // Delta stringdef E '{U+0395}' // Epsilon stringdef Z '{U+0396}' // Zeta stringdef I '{U+0397}' // Eta stringdef Th '{U+0398}' // Theta stringdef 
Y '{U+0399}' // Iota stringdef K '{U+039A}' // Kappa stringdef L '{U+039B}' // Lambda stringdef M '{U+039C}' // Mu stringdef N '{U+039D}' // Nu stringdef X '{U+039E}' // Xi stringdef O '{U+039F}' // Omicron stringdef P '{U+03A0}' // Pi stringdef R '{U+03A1}' // Rho stringdef S '{U+03A3}' // Sigma stringdef T '{U+03A4}' // Tau stringdef U '{U+03A5}' // Upsilon stringdef F '{U+03A6}' // Phi stringdef Ch '{U+03A7}' // Chi stringdef Ps '{U+03A8}' // Psi stringdef Oo '{U+03A9}' // Omega stringdef Y: '{U+03AA}' // Iota with dialytika stringdef U: '{U+03AB}' // Upsilon with dialytika stringdef a' '{U+03AC}' // alpha with tonos stringdef e' '{U+03AD}' // epsilon with tonos stringdef i' '{U+03AE}' // eta with tonos stringdef y' '{U+03AF}' // iota with tonos stringdef o' '{U+03CC}' // omicron with tonos stringdef u' '{U+03CD}' // upsilon with tonos stringdef oo' '{U+03CE}' // omega with tonos stringdef i:' '{U+0390}' // iota with dialytika and tonos stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos stringdef i: '{U+03CA}' // iota with dialytika stringdef u: '{U+03CB}' // upsilon with dialytika stringdef A' '{U+0386}' // Alpha with tonos stringdef E' '{U+0388}' // Epsilon with tonos stringdef I' '{U+0389}' // Eta with tonos stringdef Y' '{U+038A}' // Iota with tonos stringdef O' '{U+038C}' // Omicron with tonos stringdef U' '{U+038E}' // Upsilon with tonos stringdef OO' '{U+038F}' // Omega with tonos externals ( stem ) booleans ( test1 ) groupings ( v v2 ) routines ( tolower has_min_length step_s1 step_s2 step_s3 step_s4 step_s5 step_s6 step_s7 step_s8 step_s9 step_s10 step_1 step_2a step_2b step_2c step_2d step_3 step_4 step_5a step_5b step_5c step_5d step_5e step_5f step_5g step_5h step_5i step_5j step_5k step_5l step_5m step_6 step_7 ) define v '{a}{e}{i}{y}{o}{u}{oo}' define v2 '{a}{e}{i}{y}{o}{oo}' backwardmode ( define has_min_length as ( $(len >= 3) ) define tolower as ( repeat ( [substring] among ( '{A}' (<- '{a}') '{V}' (<- '{v}') '{G}' (<- '{g}') '{D}' 
(<- '{d}') '{E}' (<- '{e}') '{Z}' (<- '{z}') '{I}' (<- '{i}') '{Th}' (<- '{th}') '{Y}' (<- '{y}') '{K}' (<- '{k}') '{L}' (<- '{l}') '{M}' (<- '{m}') '{N}' (<- '{n}') '{X}' (<- '{x}') '{O}' (<- '{o}') '{P}' (<- '{p}') '{R}' (<- '{r}') '{S}' (<- '{s}') '{T}' (<- '{t}') '{U}' (<- '{u}') '{F}' (<- '{f}') '{Ch}' (<- '{ch}') '{Ps}' (<- '{ps}') '{Oo}' (<- '{oo}') '{Y:}' (<- '{y}') '{U:}' (<- '{u}') '{a'}' (<- '{a}') '{e'}' (<- '{e}') '{i'}' (<- '{i}') '{y'}' (<- '{y}') '{o'}' (<- '{o}') '{u'}' (<- '{u}') '{oo'}' (<- '{oo}') '{i:'}' (<- '{i}') '{u:'}' (<- '{u}') '{i:}' (<- '{i}') '{u:}' (<- '{u}') '{A'}' (<- '{a}') '{E'}' (<- '{e}') '{I'}' (<- '{i}') '{Y'}' (<- '{y}') '{O'}' (<- '{o}') '{U'}' (<- '{u}') '{OO'}' (<- '{oo}') '{ss}' (<- '{s}') '' (next) ) ) ) define step_1 as ( [substring] among ( '{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}') '{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}') '{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}') '{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}') '{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}') '{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}') '{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}') '{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}') '{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}') '{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}') '{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}') ) unset test1 ) define step_s1 as ( [substring] among ( 
'{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}' '{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' ( delete unset test1 ([] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' (<- '{y}') '{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}' '{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}' '{r}' '{p}{y}{p}{e}{r}{o}{r}' (<- '{y}{z}') )) ) ) ) define step_s2 as ( [substring] among ( '{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}') ) ) ) ) define step_s3 as ( (['{y}{s}{a}'] atlimit <- '{y}{s}') or [substring] among ( '{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' ( delete unset test1 ([] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' (<- '{y}') '{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}' '{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}' (<- '{y}{s}') )) ) ) ) define step_s4 as ( [substring] among ( '{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' 
'{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' (<- '{y}') ) ) ) ) define step_s5 as ( [substring] among ( '{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}' '{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' ( delete unset test1 ([] substring atlimit among ( '{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}' (<- '{y}') '{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}' '{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}' '{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}' (<- '{y}{s}{t}') )) ) ) ) define step_s6 as ( [substring] among ( '{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' ( delete unset test1 ([] substring atlimit among ( '{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}' (<- '{y}{s}{m}') '{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}' (<- '{y}') )) or ([substring] among ( '{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}') '{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}') '{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}') '{e}{th}{n}{y}{k}' (<- '{e}{th}{n}') '{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}') '{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}') '{t}{o}{p}{y}{k}' (<- '{t}{o}{p}') 
'{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}') '{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}') '{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}') )) ) ) ) define step_s7 as ( [substring] among ( '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' ( delete unset test1 [] substring atlimit among ( '{s}' '{ch}' (<- '{a}{r}{a}{k}') ) ) ) ) define step_s8 as ( [substring] among ( '{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' ( delete unset test1 ([] substring atlimit among ( '{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}' '{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}' '{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}' '{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}' (<- '{a}{k}') '{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}' '{p}{a}{t}{e}{r}' '{p}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}' // We're implementing the revised algorithm from the Saroukos paper // which also lists '{k}{o}{n}' and '{s}{k}' here, but these are // also listed just above in the `Add {a}{k} in the end` exception. // It seems they're redundant here, so we omit them (otherwise the // Snowball compiler would report an error). 
(<- '{y}{t}{s}') )) or ([] '{k}{o}{r}' <- '{y}{t}{s}') ) ) ) define step_s9 as ( [substring] among ( '{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' ( delete unset test1 ([] substring atlimit among ( '{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}') )) or ([] substring among ( '{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}') )) ) ) ) define step_s10 as ( [substring] among ( '{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' ( delete unset test1 [] substring atlimit among ( '{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}' (<- '{y}{s}{k}') ) ) ) ) define step_2a as ( [substring] among ( '{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete) ) not (substring among ( '{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}' )) insert '{a}{d}' ) define step_2b as ( [substring] among ( '{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete) ) [] substring among ( '{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}') ) ) define step_2c as ( [substring] among ( '{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete) ) [] substring among ( '{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}' '{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}') ) ) define step_2d as ( [substring] among ( '{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1) ) [] substring atlimit among ( '{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}') ) ) define step_3 as ( [substring] among ( '{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1) ) ([] v <- '{y}') ) define step_4 as ( [substring] among ( '{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1) ) ([] v <- '{y}{k}') or [] substring atlimit among ( '{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' 
'{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}' '{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}' '{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}' '{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}' (<- '{y}{k}') ) ) define step_5a as ( do (['{a}{g}{a}{m}{e}'] atlimit <- '{a}{g}{a}{m}') do ( [substring] among ( '{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1) ) ) ['{a}{m}{e}'] delete unset test1 [] substring atlimit among ( '{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}' (<- '{a}{m}') ) ) define step_5b as ( do ( [substring] among ( '{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}' '{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}') ) ) ) ) ['{a}{n}{e}'] delete unset test1 ([] v2 <- '{a}{n}') or [] substring atlimit among ( '{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}' '{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}' '{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}' '{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}' '{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}' '{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' 
'{o}{u}{r}' '{p}{y}{th}' '{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}' '{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}' '{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}' '{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}' '{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}' '{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}' '{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}' '{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}' '{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}' '{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}' (<- '{a}{n}') ) ) define step_5c as ( do ( [substring] among ( '{i}{s}{e}{t}{e}' (delete unset test1) ) ) ['{e}{t}{e}'] delete unset test1 ([] v2 <- '{e}{t}') or ([] substring among ( '{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}' '{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}' '{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}' (<- '{e}{t}') )) or [] substring atlimit among ( '{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}' '{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}' 
'{th}{a}{r}{r}' '{th}' (<- '{e}{t}') ) ) define step_5d as ( [substring] among ( '{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' ( delete unset test1 ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or ([] '{k}{r}{e}' <- '{oo}{n}{t}') ) ) ) define step_5e as ( [substring] among ( '{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' ( delete unset test1 ([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}') ) ) ) define step_5f as ( do ( ['{y}{e}{s}{t}{e}'] delete unset test1 [] substring atlimit among ( '{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}') ) ) ['{e}{s}{t}{e}'] delete unset test1 [] substring atlimit among ( '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}' (<- '{y}{e}{s}{t}') ) ) define step_5g as ( do ( [substring] among ( '{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1) ) ) [substring] among ( '{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' ( delete unset test1 ([] substring among ( '{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}') )) or ([] substring atlimit among ( '{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}') )) ) ) ) define step_5h as ( [substring] among ( '{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' ( delete unset test1 ([] substring among ( '{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}' '{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}') )) or ([] substring atlimit among ( '{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}' '{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}' '{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' 
'{t}{s}{a}' (<- '{o}{u}{s}') )) ) ) ) define step_5i as ( [substring] among ( '{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' ( delete unset test1 ([] '{k}{o}{l}{l}' <- '{a}{g}') or ( ([] substring among ( '{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}' () '{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}' (<- '{a}{g}') )) or ([] substring atlimit among ( '{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}' '{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}' '{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}' '{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}' '{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}' (<- '{a}{g}') )) ) ) ) ) define step_5j as ( [substring] among ( '{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1) ) [] substring atlimit among ( '{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' '{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}') ) ) define step_5k as ( [substring] among ( '{i}{s}{t}{e}' (delete unset test1) ) [] substring atlimit among ( '{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}' (<- '{i}{s}{t}') ) ) define step_5l as ( [substring] among ( '{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1) ) [] substring atlimit among ( '{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}') ) ) define step_5m as ( [substring] among ( '{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' 
'{i}{th}{o}{u}{m}{e}' (delete unset test1) ) [] substring atlimit among ( '{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}' (<- '{o}{u}{m}') ) ) define step_6 as ( do ( [substring] among ( '{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}') ) ) test1 [substring] among ( '{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}' '{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}' '{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}' '{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}' '{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}' '{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}' '{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}' '{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}' '{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}' '{oo}{n}' (delete) ) ) define step_7 as ( [substring] among ( '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' '{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete) ) ) ) define stem as ( backwards ( do tolower 
has_min_length set test1 do step_1 do step_s1 do step_s2 do step_s3 do step_s4 do step_s5 do step_s6 do step_s7 do step_s8 do step_s9 do step_s10 do step_2a do step_2b do step_2c do step_2d do step_3 do step_4 do step_5a do step_5b do step_5c do step_5d do step_5e do step_5f do step_5g do step_5h do step_5j do step_5i do step_5k do step_5l do step_5m do step_6 do step_7 ) ) snowball-3.1.0/algorithms/hindi.sbl000066400000000000000000000226621520373054300173140ustar00rootroot00000000000000// An implementation of "A Lightweight Stemmer for Hindi": // http://www.kbcs.in/downloads/papers/StmmerHindi.pdf externals ( stem ) stringescapes {} // The transliteration scheme used for our stringdefs matches that used in the // paper, as documented in the appendix. It appears to match the WX notation // (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently // uses 'z' for Anunasika whereas the paper uses Mh. // // We discriminate dependent vowels by adding a leading "_" to their stringdef // names (mnemonic: the _ signifies removing the implicit a from the preceding // character). 
// Vowels and sonorants: stringdef a '{U+0905}' stringdef A '{U+0906}' stringdef i '{U+0907}' stringdef I '{U+0908}' stringdef u '{U+0909}' stringdef U '{U+090A}' stringdef q '{U+090B}' stringdef e '{U+090F}' stringdef E '{U+0910}' stringdef o '{U+0913}' stringdef O '{U+0914}' // Vowel signs: stringdef _A '{U+093E}' stringdef _i '{U+093F}' stringdef _I '{U+0940}' stringdef _u '{U+0941}' stringdef _U '{U+0942}' stringdef _q '{U+0943}' stringdef _e '{U+0947}' stringdef _E '{U+0948}' stringdef _o '{U+094B}' stringdef _O '{U+094C}' // Diacritics: stringdef M '{U+0902}' stringdef H '{U+0903}' stringdef Mh '{U+0901}' stringdef Z '{U+093C}' // Nukta stringdef virama '{U+094D}' // Velar consonants: stringdef k '{U+0915}' stringdef K '{U+0916}' stringdef g '{U+0917}' stringdef G '{U+0918}' stringdef f '{U+0919}' // Palatal consonants: stringdef c '{U+091A}' stringdef C '{U+091B}' stringdef j '{U+091C}' stringdef J '{U+091D}' stringdef F '{U+091E}' // Retroflex consonants: stringdef t '{U+091F}' stringdef T '{U+0920}' stringdef d '{U+0921}' stringdef D '{U+0922}' stringdef N '{U+0923}' // Dental consonants: stringdef w '{U+0924}' stringdef W '{U+0925}' stringdef x '{U+0926}' stringdef X '{U+0927}' stringdef n '{U+0928}' // Labial consonants: stringdef p '{U+092A}' stringdef P '{U+092B}' stringdef b '{U+092C}' stringdef B '{U+092D}' stringdef m '{U+092E}' // Semi-vowels: stringdef y '{U+092F}' stringdef r '{U+0930}' stringdef l '{U+0932}' stringdef v '{U+0935}' // Fricatives: stringdef S '{U+0936}' stringdef R '{U+0937}' stringdef s '{U+0938}' stringdef h '{U+0939}' stringdef lY '{U+0933}' // Precomposed characters - letters + nukta: stringdef nZ '{U+0929}' // ≡ {n}{Z} stringdef rZ '{U+0931}' // ≡ {r}{Z} stringdef lYZ '{U+0934}' // ≡ {lY}{Z} stringdef kZ '{U+0958}' // ≡ {k}{Z} stringdef KZ '{U+0959}' // ≡ {K}{Z} stringdef gZ '{U+095A}' // ≡ {g}{Z} stringdef jZ '{U+095B}' // ≡ {j}{Z} stringdef dZ '{U+095C}' // ≡ {d}{Z} stringdef DZ '{U+095D}' // ≡ {D}{Z} stringdef PZ 
'{U+095E}' // ≡ {P}{Z} stringdef yZ '{U+095F}' // ≡ {y}{Z} groupings ( consonant ) routines ( CONSONANT ) define consonant '{k}{K}{g}{G}{f}' + '{c}{C}{j}{J}{F}' + '{t}{T}{d}{D}{N}' + '{w}{W}{x}{X}{n}' + '{p}{P}{b}{B}{m}' + '{y}{r}{l}{v}' + '{S}{R}{s}{h}' + '{lY}' + '{Z}' + // Nukta // Precomposed characters - letter and nukta: '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}' backwardmode ( define CONSONANT as ( consonant ) ) define stem as ( // We assume in this implementation that the whole word doesn't count // as a valid suffix to remove, so we remove the longest suffix from // the list which leaves at least one character. This change affects // 47 words out of the 65,140 in the sample vocabulary from Hindi // wikipedia. // // The trick here is we use `next` in forward mode to advance the cursor // to the second character, then `backwards` swaps the cursor and limit. next backwards ( [substring] among ( // The list below is derived from figure 3 in the paper. // // We perform the stemming on the Devanagari characters rather than // transliterating to Latin, so we have adapted the list below to // reflect this by converting suffixes back to Devanagari as // follows: // // * within the suffixes, "a" after a consonant is dropped since // consonants have an implicit "a". // // * within the suffixes, a vowel other than "a" after a consonant // is a dependent vowel (vowel sign); a vowel (including "a") // after a non-consonant is an independent vowel. // // * to allow the vowel at the start of each suffix being dependent // or independent, we include each suffix twice. For the // dependent version, a leading "a" is dropped and we check that // the suffix is preceded by a consonant (which will have an // implicit "a"). 
// // * we add '{a}', which is needed for the example given right at // the end of section 5 to work (conflating BarawIya and // BarawIyawA), and which 3.1 a.v strongly suggests should be in // the list: // // Thus, the following suffix deletions (longest possible // match) are required to reduce inflected forms of masculine // nouns to a common stem: // a A i [...] // // Adding '{a}' only affect 2 words out of the 65,140 in the // sample vocabulary. // // * The transliterations of our stems would end with "a" when our // stems end in a consonant, so we also include {virama} in the // list of suffixes to remove (this affects 222 words from the // sample vocabulary). // // We've also assumed that Mh in the suffix list always means {Mh} // and never {M}{h}{virama}. Only one of the 65,140 words in the // sample vocabulary stems differently due to this (and that word // seems to be a typo). '{virama}' '{a}' '{A}' '{i}' '{I}' '{u}' '{U}' '{e}' '{o}' '{e}{M}' '{o}{M}' '{A}{M}' '{u}{A}{M}' '{u}{e}{M}' '{u}{o}{M}' '{A}{e}{M}' '{A}{o}{M}' '{i}{y}{_A}{M}' '{i}{y}{_o}{M}' '{A}{i}{y}{_A}{M}' '{A}{i}{y}{_o}{M}' '{A}{Mh}' '{i}{y}{_A}{Mh}' '{A}{i}{y}{_A}{Mh}' '{a}{w}{_A}{e}{M}' '{a}{w}{_A}{o}{M}' '{a}{n}{_A}{e}{M}' '{a}{n}{_A}{o}{M}' '{a}{w}{_A}' '{a}{w}{_I}' '{I}{M}' '{a}{w}{_I}{M}' '{a}{w}{_e}' '{A}{w}{_A}' '{A}{w}{_I}' '{A}{w}{_I}{M}' '{A}{w}{_e}' '{a}{n}{_A}' '{a}{n}{_I}' '{a}{n}{_e}' '{A}{n}{_A}' '{A}{n}{_e}' '{U}{M}{g}{_A}' '{U}{M}{g}{_I}' '{A}{U}{M}{g}{_A}' '{A}{U}{M}{g}{_I}' '{e}{M}{g}{_e}' '{e}{M}{g}{_I}' '{A}{e}{M}{g}{_e}' '{A}{e}{M}{g}{_I}' '{o}{g}{_e}' '{o}{g}{_I}' '{A}{o}{g}{_e}' '{A}{o}{g}{_I}' '{e}{g}{_A}' '{e}{g}{_I}' '{A}{e}{g}{_A}' '{A}{e}{g}{_I}' '{A}{y}{_A}' '{A}{e}' '{A}{I}' '{A}{I}{M}' '{i}{e}' '{A}{o}' '{A}{i}{e}' '{a}{k}{r}' '{A}{k}{r}' '{_A}' '{_i}' '{_I}' '{_u}' '{_U}' '{_e}' '{_o}' '{_e}{M}' '{_o}{M}' '{_A}{M}' '{_u}{A}{M}' '{_u}{e}{M}' '{_u}{o}{M}' '{_A}{e}{M}' '{_A}{o}{M}' '{_i}{y}{_A}{M}' '{_i}{y}{_o}{M}' '{_A}{i}{y}{_A}{M}' '{_A}{i}{y}{_o}{M}' 
'{_A}{Mh}' '{_i}{y}{_A}{Mh}' '{_A}{i}{y}{_A}{Mh}' '{_I}{M}' '{_A}{w}{_A}' '{_A}{w}{_I}' '{_A}{w}{_I}{M}' '{_A}{w}{_e}' '{_A}{n}{_A}' '{_A}{n}{_e}' '{_U}{M}{g}{_A}' '{_U}{M}{g}{_I}' '{_A}{U}{M}{g}{_A}' '{_A}{U}{M}{g}{_I}' '{_e}{M}{g}{_e}' '{_e}{M}{g}{_I}' '{_A}{e}{M}{g}{_e}' '{_A}{e}{M}{g}{_I}' '{_o}{g}{_e}' '{_o}{g}{_I}' '{_A}{o}{g}{_e}' '{_A}{o}{g}{_I}' '{_e}{g}{_A}' '{_e}{g}{_I}' '{_A}{e}{g}{_A}' '{_A}{e}{g}{_I}' '{_A}{y}{_A}' '{_A}{e}' '{_A}{I}' '{_A}{I}{M}' '{_i}{e}' '{_A}{o}' '{_A}{i}{e}' '{_A}{k}{r}' /* Suffixes with a leading implicit a: */ '{w}{_A}{e}{M}' CONSONANT '{w}{_A}{o}{M}' CONSONANT '{n}{_A}{e}{M}' CONSONANT '{n}{_A}{o}{M}' CONSONANT '{w}{_A}' CONSONANT '{w}{_I}' CONSONANT '{w}{_I}{M}' CONSONANT '{w}{_e}' CONSONANT '{n}{_A}' CONSONANT '{n}{_I}' CONSONANT '{n}{_e}' CONSONANT '{k}{r}' CONSONANT ) delete ) ) snowball-3.1.0/algorithms/hungarian.sbl000066400000000000000000000124001520373054300201620ustar00rootroot00000000000000/* Hungarian Stemmer Removes noun inflections */ routines ( mark_regions R1 v_ending case case_special case_other plural owned sing_owner plur_owner instrum factive undouble double ) externals ( stem ) integers ( p1 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' //a-acute stringdef e' '{U+00E9}' //e-acute stringdef i' '{U+00ED}' //i-acute stringdef o' '{U+00F3}' //o-acute stringdef o" '{U+00F6}' //o-umlaut stringdef oq '{U+0151}' //o-double acute stringdef u' '{U+00FA}' //u-acute stringdef u" '{U+00FC}' //u-umlaut stringdef uq '{U+0171}' //u-double acute define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' define mark_regions as ( $p1 = limit ( // Word start with a vowel, start R1 after: V...C v do (gopast non-v setmark p1) ) or ( // Word start with a non-vowel, start R1 after: C...V gopast v setmark p1 ) ) backwardmode ( define R1 as $p1 <= cursor define v_ending as ( [substring] R1 among( '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define double as ( test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 
'jj' 'kk' 'll' 'lly' 'mm' 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') ) define undouble as ( next [hop 1] delete ) define instrum as( [substring] R1 among( 'al' (double) 'el' (double) ) delete undouble ) define case as ( [substring] R1 among( 'ban' 'ben' 'ba' 'be' 'ra' 're' 'nak' 'nek' 'val' 'vel' 't{o'}l' 't{oq}l' 'r{o'}l' 'r{oq}l' 'b{o'}l' 'b{oq}l' 'hoz' 'hez' 'h{o"}z' 'n{a'}l' 'n{e'}l' 'ig' 'at' 'et' 'ot' '{o"}t' '{e'}rt' 'k{e'}pp' 'k{e'}ppen' 'kor' 'ul' '{u"}l' 'v{a'}' 'v{e'}' 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' 'k{e'}nt' 'en' 'on' 'an' '{o"}n' 'n' 't' ) delete v_ending ) define case_special as( [substring] R1 among( '{e'}n' (<- 'e') '{a'}n' (<- 'a') '{a'}nk{e'}nt' (<- 'a') ) ) define case_other as( [substring] R1 among( 'astul' 'est{u"}l' (delete) 'stul' 'st{u"}l' (delete) '{a'}stul' (<- 'a') '{e'}st{u"}l' (<- 'e') ) ) define factive as( [substring] R1 among( '{a'}' (double) '{e'}' (double) ) delete undouble ) define plural as ( [substring] R1 among( '{a'}k' (<- 'a') '{e'}k' (<- 'e') '{o"}k' (delete) 'ak' (delete) 'ok' (delete) 'ek' (delete) 'k' (delete) ) ) define owned as ( [substring] R1 among ( 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) '{e'}k{e'}' (<- 'e') '{a'}k{e'}' (<- 'a') 'k{e'}' (delete) '{e'}{e'}i' (<- 'e') '{a'}{e'}i' (<- 'a') '{e'}i' (delete) '{e'}{e'}' (<- 'e') '{e'}' (delete) ) ) define sing_owner as ( [substring] R1 among( '{u"}nk' 'unk' (delete) '{a'}nk' (<- 'a') '{e'}nk' (<- 'e') 'nk' (delete) '{a'}juk' (<- 'a') '{e'}j{u"}k' (<- 'e') 'juk' 'j{u"}k' (delete) 'uk' '{u"}k' (delete) 'em' 'om' 'am' (delete) '{a'}m' (<- 'a') '{e'}m' (<- 'e') 'm' (delete) 'od' 'ed' 'ad' '{o"}d' (delete) '{a'}d' (<- 'a') '{e'}d' (<- 'e') 'd' (delete) 'ja' 'je' (delete) 'a' 'e' 'o' (delete) '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define plur_owner as ( [substring] R1 among( 'jaim' 'jeim' (delete) '{a'}im' (<- 'a') '{e'}im' (<- 'e') 'aim' 'eim' (delete) 'im' (delete) 'jaid' 'jeid' (delete) '{a'}id' (<- 'a') '{e'}id' (<- 'e') 'aid' 'eid' (delete) 'id' 
(delete) 'jai' 'jei' (delete) '{a'}i' (<- 'a') '{e'}i' (<- 'e') 'ai' 'ei' (delete) 'i' (delete) 'jaink' 'jeink' (delete) 'eink' 'aink' (delete) '{a'}ink' (<- 'a') '{e'}ink' (<- 'e') 'ink' 'jaitok' 'jeitek' (delete) 'aitok' 'eitek' (delete) '{a'}itok' (<- 'a') '{e'}itek' (<- 'e') 'itek' (delete) 'jeik' 'jaik' (delete) 'aik' 'eik' (delete) '{a'}ik' (<- 'a') '{e'}ik' (<- 'e') 'ik' (delete) ) ) ) define stem as ( do mark_regions backwards ( do instrum do case do case_special do case_other do factive do owned do sing_owner do plur_owner do plural ) ) snowball-3.1.0/algorithms/indonesian.sbl000066400000000000000000000170351520373054300203460ustar00rootroot00000000000000// An implementation of the "Porter Stemmer for Bahasa Indonesia" from: // http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf integers ( // The paper defines measure as the number of vowels in the word. We // count this initially, then adjust the count each time we remove a // prefix or suffix. measure // Numeric code for the type of prefix removed: // // 0 other/none // 1 'di' or 'meng' or 'ter' // 2 'per' // 3 'ke' or 'peng' // 4 'ber' // // Some of these have variant forms, so e.g. "meng" includes "men", "me", // "meny", "mem". // // Note that the value of prefix is only used in remove_suffix (and // routines it calls) so we don't need to worry about // remove_second_order_prefix overwriting a value of prefix set by // remove_first_order_prefix since remove_suffix gets called between // the two. prefix ) groupings ( vowel ) routines ( remove_particle remove_possessive_pronoun remove_first_order_prefix remove_second_order_prefix remove_suffix ) externals ( stem ) stringescapes {} backwardmode ( define remove_particle as ( [substring] among ( 'kah' 'lah' 'pun' (delete $measure-=1) ) ) define remove_possessive_pronoun as ( [substring] among ( 'ku' 'mu' 'nya' (delete $measure-=1) ) ) define remove_suffix as ( [substring] among ( 'an' ( ( // prefix not in {ke, peng, per}. 
[See SUFFIX_KAN_NOTE] ($prefix != 3 and $prefix != 2) // Remove suffix 'kan' 'k'] ) or ( // prefix not in {di, meng, ter} $prefix != 1 ) ) 'i' ( // prefix not in {ke, peng, ber} $prefix <= 2 // word does not end '-si'. [See SUFFIX_I_NOTE] not 's' ) ) delete $measure-=1 ) // SUFFIX_KAN_NOTE: // On page 29, the example "kompas Q.31" says "Both Nazief and Porter // stemmer converted the word peledakan (blast, explotion [sic]) to // ledak (to blast, to explode)". However, the algorithm as described // doesn't behave in this way - grammatically the prefix pe- occurs as a // variation of both the first-order derivational prefix peng- and the // second-order derivational prefix per-, but table 2.5 doesn't include // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly) // as having prefix "per" not "peng", and so we remove derivational // suffix "kan" rather than "an" to give stem leda. (Porter-style // stemmers remove the longest suffix they can amongst those available, // which this paper notes in the last paragraph on page 15). // // We resolve this by amending the condition on suffix "kan" to // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's // behaviour match all the examples in the paper except for one: // "perbaikan" is shown in table 3.4 as stemming to "bai", but with // this change it now stems to "baik". The table notes that "baik" is // the actual root so this deviation is an improvement. In a sample // vocabulary derived from the most common words in id.wikipedia.org, // this change only affects 0.12% of words (76 out of 64,587, including // "peledakan" and "perbaikan"). // SUFFIX_I_NOTE: // The rest of the condition from the paper is: // V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i // // The meaning of this is unclear in several ways, and none of the // examples given of the stemmer's behaviour in the paper help to // resolve these issues. 
// // Notice that c₂ isn't actually used - the most obvious explanation // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁". // // Elsewhere the paper defines V... as meaning "the stem starts with // a vowel" and K... as meaning "the stem starts with a consonant". // The meaning of | isn't actually defined, but clearly means // alternation. // // However nowhere is the precedence of | vs ... defined, and there // isn't a standard precedence we could reasonably assume. In other // places where the paper says X|Y... it seems the | binds more // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit // odd as the first letter must be either a vowel or a consonant, so // that really just means "ends cᵢcⱼ" (and has at least one letter // before cᵢ but we only call remove_suffix if $measure > 2 which // ensures that part). However, nowhere in the paper uses or defines // a notation such as ...X, which may explain this seemingly redundant // way of specifying this. // // The conditions elsewhere on prefix removal (e.g. V...) are clearly // on the stem left after the prefix is removed. None of the other // rules for suffix removal have conditions on the stem, but for // consistency with the prefix rules we might expect that the cᵢcⱼ // test is on what's left *after* removing the "i" suffix. // // Studying Indonesian wordlists and discussion with a native // speaker leads us to conclude that the purpose of this check is to // protect words of foreign origin (e.g. "televisi", "organisasi", // "komunikasi") from stemming, and the common feature of these is // that the word ends "-si", so we conclude that the condition here // should be read as "word does not end -si", and this is what we // have implemented. 
) define vowel 'aeiou' define remove_first_order_prefix as ( [substring] among ( 'di' 'meng' 'me' 'ter' (delete $prefix=1 $measure-=1) 'men' ( ('y' test vowel ] <-'s' $prefix=1 $measure-=1) or (delete $prefix=1 $measure-=1) ) 'ke' 'peng' (delete $prefix=3 $measure-=1) 'pen' ( ('y' test vowel ] <-'s' $prefix=3 $measure-=1) or (delete $prefix=3 $measure-=1) ) 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) ) ) define remove_second_order_prefix as ( // The paper has the condition on removal of prefix "bel" and "pel" as // just "ajar" not "ajar..." but it seems that the latter must be what // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". // This change only affects a very small number of words (11 out of // 64,587) and only for the better. [among ( 'pe' ( ( 'r'] $prefix=2 ) or ( 'l'] 'ajar' ) or ( ] $prefix=2 ) ) 'be' ( ( 'r'] ) or ( 'l'] 'ajar') or ( ] non-vowel 'er' ) $prefix=4 ) ) // All prefixes we remove here contain exactly one vowel. 
$measure-=1 delete ) define stem as ( $measure = 0 do ( repeat ( gopast vowel $measure+=1 ) ) $measure > 2 $prefix = 0 backwards ( do remove_particle $measure > 2 do remove_possessive_pronoun ) $measure > 2 test ( remove_first_order_prefix do ( test ($measure > 2 backwards remove_suffix) $measure > 2 remove_second_order_prefix ) ) or ( do remove_second_order_prefix do ($measure > 2 backwards remove_suffix) ) ) snowball-3.1.0/algorithms/irish.sbl000066400000000000000000000047471520373054300173430ustar00rootroot00000000000000routines ( R1 R2 RV initial_morph mark_regions noun_sfx deriv verb_sfx ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* Accented characters */ stringdef a' '{U+00E1}' // a-acute stringdef e' '{U+00E9}' // e-acute stringdef i' '{U+00ED}' // i-acute stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute define v 'aeiou{a'}{e'}{i'}{o'}{u'}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( gopast v setmark pV gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define initial_morph as ( [substring] among ( 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic (delete) // verbs 'd{'}' (delete) 'd{'}fh' (<- 'f') // other contractions 'm{'}' 'b{'}' (delete) 'sh' (<- 's') 'mb' (<- 'b') 'gc' (<- 'c') 'nd' (<- 'd') 'bhf' (<- 'f') 'ng' (<- 'g') 'bp' (<- 'p') 'ts' (<- 's') 'dt' (<- 't') // Lenition 'bh' (<- 'b') 'ch' (<- 'c') 'dh' (<- 'd') 'fh' (<- 'f') 'gh' (<- 'g') 'mh' (<- 'm') 'ph' (<- 'p') 'th' (<- 't') ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define noun_sfx as ( [substring] among ( 'amh' 'eamh' 'abh' 'eabh' 'aibh' 'ibh' 'aimh' 'imh' 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta' (R1 delete) 'ire' 'ir{i'}' 'aire' 'air{i'}' (R2 delete) ) ) define deriv as ( [substring] among ( 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta' (R2 delete) //siopadóireacht -> siopadóir but not 
poblacht -> pobl 'arcacht' 'arcachta{i'}' 'arcachta' (<- 'arc') // monarcacht -> monarc 'gineach' 'gineas' 'ginis' (<- 'gin') 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}' (<- 'graf') 'paite' 'patach' 'pataigh' 'patacha' (<- 'paite') '{o'}ideach' '{o'}ideacha' '{o'}idigh' (<- '{o'}id') ) ) define verb_sfx as ( [substring] among ( 'imid' 'aimid' '{i'}mid' 'a{i'}mid' 'faidh' 'fidh' (RV delete) 'ain' 'eadh' 'adh' '{a'}il' 'tear' 'tar' (R1 delete) ) ) ) define stem as ( do initial_morph do mark_regions backwards ( do noun_sfx do deriv do verb_sfx ) ) snowball-3.1.0/algorithms/italian.sbl000066400000000000000000000135311520373054300176350ustar00rootroot00000000000000 routines ( elisions prelude postlude mark_regions RV R1 R2 attached_pronoun standard_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v AEIO CG ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' stringdef a` '{U+00E0}' stringdef e' '{U+00E9}' stringdef e` '{U+00E8}' stringdef i' '{U+00ED}' stringdef i` '{U+00EC}' stringdef o' '{U+00F3}' stringdef o` '{U+00F2}' stringdef u' '{U+00FA}' stringdef u` '{U+00F9}' define v 'aeiou{a`}{e`}{i`}{o`}{u`}' define elisions as ( [ substring ] not atlimit among ( // 'c{'}' doesn't seem useful to remove here. 'd{'}' // e.g. d'Italia ("of Italy") 'l{'}' // e.g. l'anno ("the year") 'm{'}' // e.g. m'ama ("he loves me") 's{'}' // e.g. s'innamora ("he falls in love") 't{'}' // e.g. t'amo ("I love you") 'v{'}' // e.g. v'adoro ("I adore you") 'all{'}' // e.g. all'università ("at university") 'dall{'}' // e.g. dall'album ("from the album") 'dell{'}' // e.g. dell'anno ("of the year") 'gl{'}' // e.g. gl'inglesi ("the English") 'nell{'}' // e.g. nell'estate ("in the summer") 'quell{'}' // e.g. quell'anno ("that year") 'quest{'}' // e.g. quest'anno ("this year") 'sull{'}' // e.g. sull'isola ("on the island") 'tutt{'}' // e.g. tutt'Europa ("all of Europe") 'un{'}' // e.g. 
un'eccentricità ("an eccentricity") ) delete ) define prelude as ( test repeat ( [substring] among( '{a'}' (<- '{a`}') '{e'}' (<- '{e`}') '{i'}' (<- '{i`}') '{o'}' (<- '{o`}') '{u'}' (<- '{u`}') 'qu' (<- 'qU') '' (next) ) ) repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or 'divan' // Otherwise "divano" stems to "div" and collides with "diva". or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi' // the compound forms are: 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' 'mela' 'mele' 'meli' 'melo' 'mene' 'tela' 'tele' 'teli' 'telo' 'tene' 'cela' 'cele' 'celi' 'celo' 'cene' 'vela' 'vele' 'veli' 'velo' 'vene' ) substring RV among( 'ando' 'endo' (delete) 'ar' 'er' 'ir' (<- 'e') ) ) define standard_suffix as ( [substring] among( 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' 'atrice' 'atrici' 'ante' 'anti' ( R2 delete ) 'azione' 'azioni' 'atore' 'atori' ( R2 delete try ( ['ic'] R2 delete ) ) 'logia' 'logie' ( R2 <- 'log' ) 'uzione' 'uzioni' 'usione' 'usioni' ( R2 <- 'u' ) 'enza' 'enze' ( R2 <- 'ente' ) 'amento' 'amenti' 'imento' 'imenti' ( RV delete ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' ( ['at'] R2 delete ) 'os' 'ic' 'abil' ) ) ) 'it{a`}' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'ivo' 'ivi' 'iva' 'ive' ( R2 delete try ( ['at'] R2 delete ['ic'] R2 delete ) ) ) ) define verb_suffix as setlimit 
tomark pV for ( [substring] among( 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' 'ono' 'uta' 'ute' 'uti' 'uto' 'ar' 'ir' // but 'er' is problematical (delete) ) ) define AEIO 'aeio{a`}{e`}{i`}{o`}' define CG 'cg' define vowel_suffix as ( try ( [AEIO] RV delete ['i'] RV delete ) try ( ['h'] CG RV delete ) ) ) define stem as ( do elisions do prelude do mark_regions backwards ( do attached_pronoun do (standard_suffix or verb_suffix) do vowel_suffix ) do postlude ) snowball-3.1.0/algorithms/lithuanian.sbl000066400000000000000000000314711520373054300203530ustar00rootroot00000000000000externals ( stem ) // escape symbols for substituting lithuanian characters stringescapes { } /* Special characters in Unicode Latin Extended-A */ // k nosine stringdef ak '{U+0105}' // ą a + ogonek stringdef ek '{U+0119}' // ę e + ogonek stringdef ik '{U+012F}' // į i + ogonek stringdef uk '{U+0173}' // ų u + ogonek // . taskas stringdef e. 
'{U+0117}' // ė e + dot // - ilgoji stringdef u- '{U+016B}' // ū u + macron // v varnele stringdef cv '{U+010D}' // č c + caron (haček) stringdef sv '{U+0161}' // š s + caron (haček) stringdef zv '{U+017E}' // ž z + caron (haček) // [C](VC)^m[V|C] // definitions of variables for // p1 - position of m = 0 integers ( p1 ) // groupings // v - lithuanian vowels groupings ( v ) // v - all lithuanian vowels define v 'aeiyou{ak}{ek}{ik}{uk}{e.}{u-}' // all lithuanian stemmer routines: 4 steps routines ( step2 step1 fix_chdz fix_gd fix_conflicts ) backwardmode ( define step1 as ( setlimit tomark p1 for ([substring]) among ( // Daiktavardžiai (Nouns) // I linksniuotė (declension I) 'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys 'o' 'io' // vyro, kelio 'ui' 'iui' // vyrui, keliui '{ak}' 'i{ak}' '{ik}' // vyrą, kelią, brolį 'u' 'iu' // vyru, keliu 'e' 'yje' // vyre, kelyje 'y' 'au' 'i' // kely, brolau, broli, 'an' // nusižengiman 'ai' 'iai' // vyrai, keliai '{uk}' 'i{uk}' // vyrų, kelių 'ams' 'am' // vyrams, vyram 'iams' 'iam' // broliams, broliam 'us' 'ius' // vyrus, brolius 'ais' 'iais' // vyrais, keliais 'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos 'uosna' 'iuosna' // vyruosna, keliuosna 'ysna' // žutysna 'asis' 'aisi' // sukimasis, sukimaisi 'osi' '{uk}si' // sukimosi, sukimųsi 'uisi' // sukimuisi '{ak}si' // sukimąsi 'usi' // sukimusi 'esi' // sukimesi 'uo' // mėnuo // II linksniuote (declension II) 'a' 'ia' // galva, vysnios 'os' 'ios' // galvos, vysnios 'oj' 'oje' 'ioje' // galvoje, vysnioje 'osna' 'iosna' // galvosna, vyšniosna 'om' 'oms' 'ioms' // galvoms, vysnioms 'omis' 'iomis' // galvomis, vysniomis 'ose' 'iose' // galvose, vysniose 'on' 'ion' // galvon, vyšnion // III linksniuote (declension III) '{e.}' // gervė '{e.}s' // gervės 'ei' // gervei '{ek}' // gervę '{e.}j' '{e.}je' // gervėj, gervėje '{e.}ms' // gervėms 'es' // gerves '{e.}mis' // gervėmis '{e.}se' // gervėse '{e.}sna' // gervėsna '{e.}n' // žydaitėn // IV linksniuote 
(declension IV) 'aus' 'iaus' // sūnaus, skaičiaus 'umi' 'iumi' // sūnumi, skaičiumi 'uje' 'iuje' // sūnuje, skaičiuje 'iau' // skaičiau '{u-}s' // sūnūs 'ums' // sūnums 'umis' // sūnumis 'un' 'iun' // sūnun, administratoriun // V linksniuote (declension V) 'ies' 'ens' 'enio' // avies, vandens, sesers 'eniui' // vandeniui 'en{ik}' // vandenį 'imi' 'eniu' // avimi, vandeniu 'enyje' // vandenyje 'ie' 'enie' // avie, vandenide 'enys' // vandenys // 'en{uk}' konfliktas su 'žandenų' 'antenų' 'ims' 'enims' // avims, vandemins 'enis' // vandenis 'imis' // žebenkštimis 'enimis' // vandenimis 'yse' 'enyse' // avyse, vandenyse // Būdvardžiai (Adjectives) // (i)a linksniuotė 'iem' 'iems' // geriem, geriems 'ame' 'iame' // naujame, mediniame // Veiksmažodžiai (Verbs) // Tiesioginė nuosaka (indicative mood) // esamasis laikas (present tense) // (i)a asmenuotė (declension (i)a) 'uosi' 'iuosi' // dirbuosi, traukiuosi 'iesi' // dirbiesi 'asi' 'iasi' // dirbasi, traukiasi 'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės 'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate 'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės // i asmenuotė (declension i) 'isi' // tikisi 'im' // mylim // 'ime' konfliktassu daiktavardžiu vietininku, pvz. 'gėrime' 'im{e.}s' // tikimės 'it' 'ite' // mylit, mylite, tikitės // 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. 
žydaitės // o asmenuotė (declension o) 'ome' // mokome 'ot' 'ote' // mokot, mokote // būtasis laikas // o asmenuotė (declension o) '{e.}jo' '{e.}josi' // tikėjo, tikėjosi 'ot{e.}s' // tikėjotės/bijotės // ė asmenuotė (declension ė) 'eisi' // mokeisi '{e.}si' // mokėsi '{e.}m' '{e.}me' // mokėm, mokėme '{e.}m{e.}s' // mokėmės '{e.}t' '{e.}te' // mokėt, mokėte '{e.}t{e.}s' // mokėtės // būtasis dažninis laikas (frequentative past tense) 'ausi' // mokydavausi 'om{e.}s' // mokydavomės/bijomės // būsimasis laikas (future tense) 'siu' 'siuosi' // dirbsiu, mokysiuosi 'si' 'siesi' // dirbsi, dirbsiesi 's' 'ysis' // dirbs, mokysis 'sim' 'sime' // dirbsim, dirbsime 'sit' 'site' // gersit, gersite // tariamoji nuosaka (subjunctive mood) '{cv}iau' '{cv}iausi' // dirbčiau 'tum' 'tumei' // dirbtum, dirbtumei 'tumeis' 'tumeisi' // mokytumeis, mokytumeisi // 't{uk}' nes blogai batutų -> batų 't{uk}si' // mokytųsi // 'tume' konfliktas su 'šventume' 'tum{e.}m' // dirbtumėm 'tum{e.}me' // dirbtumėme 'tum{e.}m{e.}s' // mokytumėmės 'tute' 'tum{e.}t' // dirbtute, dirbtumėt 'tum{e.}te' // dirbtumėte 'tum{e.}t{e.}s' // mokytumėtės // liepiamoji nuosaka (imperative mood) 'k' 'ki' // dirbk, dirbki, mokykis // 'kis' konfliktas viln-išk-is // 'kime' konfliktas, nes pirkime 'kim{e.}s' // mokykimės // bendratis (infinitive) 'uoti' 'iuoti' // meluoti, dygsniuoti 'auti' 'iauti' // draugauti, girtuokliauti 'oti' 'ioti' // dovanoti, meškerioti '{e.}ti' // auklėti 'yti' // akyti 'inti' // auginti 'in{e.}ti' // blusinėti 'enti' // gyventi 'tel{e.}ti' // bumbtelėti 'ter{e.}ti' // bumbterėti 'ti' // skalbti // 'tis' konfliktas, nes rytme-tis -> rytme // dalyviai (participles) '{ak}s' 'i{ak}s' '{ik}s' // dirbąs, žaidžiąs, gulįs 't{uk}s' // suktųs -> suk 'sim{e.}s' // suksimės 'sit{e.}s' // suksitės 'kite' // supkite ) delete ) define step2 as repeat ( setlimit tomark p1 for ([substring]) among ( // daiktavardziu priesagos (Noun suffixes) // budvardziu priesagos (Adjective suffixes) // 'in' // konfliktas 
su 'augintinis' ir 'akiniais' // lauk-in-is 'ing' // tvark-ing-as 'i{sv}k' // lenk-išk-as '{e.}t' // dem-ėt-as 'ot' // garban-ot-as 'uot' 'iuot' // lang-uot-as, akin-iuot-as // 'tin', nes augintinis // dirb-tin-is // 'ut', nes batutas, degutas etc. // maž-ut-is 'yt' // maž-yt-is 'iuk' // maž-iuk-as 'iul' // maž-ul-is '{e.}l' // maž-ėl-is 'yl' // maž-yl-is 'u{cv}iuk' // maž-učiuk-as 'uliuk' // maž-uliuk-as 'ut{e.}ait' // maž-utėlait-is 'ok' // did-ok-as 'iok' // višč-iok-as 'sv' '{sv}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as 'op' 'iop' // dvej-op-as, viener-iop-as 'ain' // apval-ain-as 'yk{sv}t' 'yk{sv}{cv}' // ten-ykšt-is, vakar-ykšč-ias // laisniai 'esn' // did-esn-is 'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias // ivardziuotiniai budvardziai (Pronominal adjectives) // vyriska gimine (Male gender) 'ias' // žaliasis 'oj' 'ioj' // gerojo, žaliojo 'aj' 'iaj' // gerajam, žaliajam '{ak}j' 'i{ak}j' // garąjį, žaliąjį 'uoj' 'iuoj' // geruoju, žaliuoju 'iej' // gerieji '{uk}j' 'i{uk}j' // gerųjų, žaliųjų 'ies' // geriesiems 'uos' 'iuos' // geruosius, žaliuosius 'ais' 'iais' // geraisiais, žaliaisiais // moteriska gimine (Female gender) 'os' 'ios' // gerosios, žaliosios '{ak}s' 'i{ak}s' // gerąsios, žaliąsias // būtasis dažninis laikas (frequentative past tense) 'dav' // ei-dav-o // dalyvių priesagos (particple suffix) 'ant' 'iant' 'int' // tur-int-is '{e.}j' // tur-ėj-o '{ek}' // '{e.}j{ek}' '{ek}s' // dirb-ęs-is 'siant' // dirb-siant // pusdalyviai (participle) 'dam' // bėg-dam-as 'auj' // ūkinink-auj-a 'jam' 'iau' 'am' // baiminim-ams-i ) delete ) define fix_conflicts as ( [substring] among ( // 'lietuvaite' -> 'lietuvaitė', konfliktas su 'myl-ite' 'aite' (<-'ait{e.}') // 'lietuvaitės' -> 'lietuvaitė', konfliktas su 'myl-itės' 'ait{e.}s' (<-'ait{e.}') // ''ūs-uotės' -> 'ūs-uotė', konfliktas 'mokotės' 'uot{e.}s' (<-'uot{e.}') // ''ūs-uote' -> 'ūs-uotė', konfliktas 'mokote' 'uote' (<-'uot{e.}') // 'žerėjime' -> 'žėrėjimas', konfliktas su 'žais-ime' '{e.}jime' 
(<-'{e.}jimas') // 'žvilgesiu' -> 'žvilgesys', konfliktas su 'dirb-siu' 'esiu' (<-'esys') // 'duobkasiu' -> 'duobkasys', konfliktas su 'pakasiu' 'asius' (<-'asys') // 'žioravime' -> 'žioravimas', konfliktas su 'myl-ime' 'avime' (<-'avimas') 'ojime' (<-'ojimas') // 'advokatės' -> 'advokatė', konfliktas su 'dirb-atės' 'okat{e.}s' (<-'okat{e.}') // 'advokate' -> 'advokatė', konfliktas su 'dirb-ate' 'okate' (<-'okat{e.}') ) ) define fix_chdz as ( [substring] among ( '{cv}' (<-'t') 'd{zv}' (<-'d') ) ) define fix_gd as ( [substring] among ( 'gd' (<-'g') // '{e.}k' (<-'{e.}g') ) ) ) define stem as ( $p1 = limit do ( // priešdėlis 'a' ilgeniuose nei 6 raidės žodžiuose, pvz. 'a-liejus'. try ('a' $(len > 6)) gopast v gopast non-v setmark p1 ) backwards ( do fix_conflicts do step1 do fix_chdz do step2 do fix_chdz do fix_gd ['{'}'] delete ) ) snowball-3.1.0/algorithms/lovins.sbl000066400000000000000000000200021520373054300175150ustar00rootroot00000000000000 stringescapes {} routines ( A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC endings undouble respell ) externals ( stem ) backwardmode ( /* Lovins' conditions A, B ... CC, as given in her Appendix B, where a test for a two letter prefix ('test hop 2') is implicitly assumed. Note that 'e' next 'u' corresponds to her u*e because Snowball is scanning backwards. 
*/ define A as ( hop 2 ) define B as ( hop 3 ) define C as ( hop 4 ) define D as ( hop 5 ) define E as ( test hop 2 not 'e' ) define F as ( test hop 3 not 'e' ) define G as ( test hop 3 'f' ) define H as ( test hop 2 't' or 'll' ) define I as ( test hop 2 not 'o' not 'e' ) define J as ( test hop 2 not 'a' not 'e' ) define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' ) define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) define O as ( test hop 2 'l' or 'i' ) define P as ( test hop 2 not 'c' ) define Q as ( test hop 2 test hop 3 not 'l' not 'n' ) define R as ( test hop 2 'n' or 'r' ) define S as ( test hop 2 'dr' or ('t' not 't') ) define T as ( test hop 2 's' or ('t' not 'o') ) define U as ( test hop 2 'l' or 'm' or 'n' or 'r' ) define V as ( test hop 2 'c' ) define W as ( test hop 2 not 's' not 'u' ) define X as ( test hop 2 'l' or 'i' or ('e' next 'u') ) define Y as ( test hop 2 'in' ) define Z as ( test hop 2 not 'f' ) define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or' 'es' 't' ) ) define BB as ( test hop 3 not 'met' not 'ryst' ) define CC as ( test hop 2 'l' ) /* The system of endings, as given in Appendix A. 
*/ define endings as ( [substring] among( 'alistically' B 'arizability' A 'izationally' B 'antialness' A 'arisations' A 'arizations' A 'entialness' A 'allically' C 'antaneous' A 'antiality' A 'arisation' A 'arization' A 'ationally' B 'ativeness' A 'eableness' E 'entations' A 'entiality' A 'entialize' A 'entiation' A 'ionalness' A 'istically' A 'itousness' A 'izability' A 'izational' A 'ableness' A 'arizable' A 'entation' A 'entially' A 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A 'ionality' A 'ionalize' A 'iousness' A 'izations' A 'lessness' A 'ability' A 'aically' A 'alistic' B 'alities' A 'ariness' E 'aristic' A 'arizing' A 'ateness' A 'atingly' A 'ational' B 'atively' A 'ativism' A 'elihood' E 'encible' A 'entally' A 'entials' A 'entiate' A 'entness' A 'fulness' A 'ibility' A 'icalism' A 'icalist' A 'icality' A 'icalize' A 'ication' G 'icianry' A 'ination' A 'ingness' A 'ionally' A 'isation' A 'ishness' A 'istical' A 'iteness' A 'iveness' A 'ivistic' A 'ivities' A 'ization' F 'izement' A 'oidally' A 'ousness' A 'aceous' A 'acious' B 'action' G 'alness' A 'ancial' A 'ancies' A 'ancing' B 'ariser' A 'arized' A 'arizer' A 'atable' A 'ations' B 'atives' A 'eature' Z 'efully' A 'encies' A 'encing' A 'ential' A 'enting' C 'entist' A 'eously' A 'ialist' A 'iality' A 'ialize' A 'ically' A 'icance' A 'icians' A 'icists' A 'ifully' A 'ionals' A 'ionate' D 'ioning' A 'ionist' A 'iously' A 'istics' A 'izable' E 'lessly' A 'nesses' A 'oidism' A 'acies' A 'acity' A 'aging' B 'aical' A 'alist' A 'alism' B 'ality' A 'alize' A 'allic'BB 'anced' B 'ances' B 'antic' C 'arial' A 'aries' A 'arily' A 'arity' B 'arize' A 'aroid' A 'ately' A 'ating' I 'ation' B 'ative' A 'ators' A 'atory' A 'ature' E 'early' Y 'ehood' A 'eless' A 'elity' A 'ement' A 'enced' A 'ences' A 'eness' E 'ening' E 'ental' A 'ented' C 'ently' A 'fully' A 'ially' A 'icant' A 'ician' A 'icide' A 'icism' A 'icist' A 'icity' A 'idine' I 'iedly' A 'ihood' A 'inate' A 'iness' A 'ingly' B 'inism' J 'inity'CC 
'ional' A 'ioned' A 'ished' A 'istic' A 'ities' A 'itous' A 'ively' A 'ivity' A 'izers' F 'izing' F 'oidal' A 'oides' A 'otide' A 'ously' A 'able' A 'ably' A 'ages' B 'ally' B 'ance' B 'ancy' B 'ants' B 'aric' A 'arly' K 'ated' I 'ates' A 'atic' B 'ator' A 'ealy' Y 'edly' E 'eful' A 'eity' A 'ence' A 'ency' A 'ened' E 'enly' E 'eous' A 'hood' A 'ials' A 'ians' A 'ible' A 'ibly' A 'ical' A 'ides' L 'iers' A 'iful' A 'ines' M 'ings' N 'ions' B 'ious' A 'isms' B 'ists' A 'itic' H 'ized' F 'izer' F 'less' A 'lily' A 'ness' A 'ogen' A 'ward' A 'wise' A 'ying' B 'yish' A 'acy' A 'age' B 'aic' A 'als'BB 'ant' B 'ars' O 'ary' F 'ata' A 'ate' A 'eal' Y 'ear' Y 'ely' E 'ene' E 'ent' C 'ery' E 'ese' A 'ful' A 'ial' A 'ian' A 'ics' A 'ide' L 'ied' A 'ier' A 'ies' P 'ily' A 'ine' M 'ing' N 'ion' Q 'ish' C 'ism' B 'ist' A 'ite'AA 'ity' A 'ium' A 'ive' A 'ize' F 'oid' A 'one' R 'ous' A 'ae' A 'al'BB 'ar' X 'as' B 'ed' E 'en' F 'es' E 'ia' A 'ic' A 'is' A 'ly' B 'on' S 'or' T 'um' U 'us' V 'yl' R '{'}s' A 's{'}' A 'a' A 'e' A 'i' A 'o' A 's' W 'y' B (delete) ) ) /* Undoubling is rule 1 of appendix C. */ define undouble as ( test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' 'tt') [next] delete ) /* The other appendix C rules can be done together. */ define respell as ( [substring] among ( 'iev' (<-'ief') 'uct' (<-'uc') 'umpt' (<-'um') 'rpt' (<-'rb') 'urs' (<-'ur') 'istr' (<-'ister') 'metr' (<-'meter') 'olv' (<-'olut') 'ul' (not 'a' not 'i' not 'o' <-'l') 'bex' (<-'bic') 'dex' (<-'dic') 'pex' (<-'pic') 'tex' (<-'tic') 'ax' (<-'ac') 'ex' (<-'ec') 'ix' (<-'ic') 'lux' (<-'luc') 'uad' (<-'uas') 'vad' (<-'vas') 'cid' (<-'cis') 'lid' (<-'lis') 'erid' (<-'eris') 'pand' (<-'pans') 'end' (not 's' <-'ens') 'ond' (<-'ons') 'lud' (<-'lus') 'rud' (<-'rus') 'her' (not 'p' not 't' <-'hes') 'mit' (<-'mis') 'ent' (not 'm' <-'ens') /* 'ent' was 'end' in the 1968 paper - a typo. 
*/ 'ert' (<-'ers') 'et' (not 'n' <-'es') 'yt' (<-'ys') 'yz' (<-'ys') ) ) ) define stem as ( backwards ( do endings do undouble do respell ) ) snowball-3.1.0/algorithms/nepali.sbl000066400000000000000000000130271520373054300174640ustar00rootroot00000000000000/* * Authors: * - Ingroj Shrestha , Nepali NLP Group * - Oleg Bartunov , Postgres Professional Ltd. * - Shreeya Singh Dhakal, Nepali NLP Group */ routines ( remove_category_1 remove_category_2 remove_category_3 ) stringescapes {} stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E stringdef dvsai '{U+0948}' // 
DEVANAGARI_VOWEL_SIGN_AI stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA externals ( stem ) backwardmode ( define remove_category_1 as( [substring] among ( '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}' '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}' '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}' (delete) '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}' ('{dle}' or '{dvse}' or delete) ) ) define remove_category_2 as ( [substring] among( '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) '{dvsai}' ('{dlta}{dsv}{dlr}' delete) ) ) define remove_category_3 as( [substring] among( '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' 
'{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}' (delete) ) ) ) define stem as ( backwards ( do remove_category_1 repeat ( do remove_category_2 remove_category_3 ) ) ) snowball-3.1.0/algorithms/norwegian.sbl000066400000000000000000000044161520373054300202070ustar00rootroot00000000000000routines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 ) groupings ( v s_ending ) stringescapes {} /* special characters */ stringdef ae '{U+00E6}' stringdef ao '{U+00E5}' stringdef e^ '{U+00EA}' // e-circumflex stringdef o` '{U+00F2}' // o-grave stringdef o' '{U+00F3}' // o-acute stringdef o^ '{U+00F4}' // o-circumflex stringdef o/ '{U+00F8}' define v 'ae{e^}io{o`}{o'}{o^}uy{ae}{ao}{o/}' define s_ending 'bcdfghjlmnoptvyz' define mark_regions as ( $p1 = limit do ( ( // If there's an apostrophe, start R1 after it to handle // acronym loanwords such as "pc'en" and "ep'en". gopast '{'}' ) or ( gopast v gopast non-v ) setmark p1 ) // Ensure at least 3 characters before R1. 
test (hop 3 do ($p1 < cursor $p1 = cursor)) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' 'hetens' 'ets' 'et' 'het' 'ast' (delete) 'ers' ( among ( 'amm' 'ast' 'ind' 'kap' 'kk' 'lt' 'nk' 'omm' 'pp' 'v' '{o/}st' () 'giv' 'hav' 'skap' '' (delete) ) ) 's' (s_ending or ('r' not 'e') or ('k' non-v) delete) 'erte' 'ert' (<-'er') ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'dt' 'vt' ) ) next] delete ) define other_suffix as ( setlimit tomark p1 for ([substring]) among( 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' 'hetslov' (delete) ) ) ) define stem as ( mark_regions backwards ( do main_suffix do consonant_pair do other_suffix // Remove trailing apostrophe. ['{'}'] delete ) ) snowball-3.1.0/algorithms/persian.sbl000066400000000000000000000403551520373054300176610ustar00rootroot00000000000000/* * ----------------------------------------------------------------------------- * Persian Stemmer (HPS-like) * ----------------------------------------------------------------------------- * Based on the paper: * "HPS: A Hierarchical Persian Stemming Method" * Ayshe Rashidi, Mina Zolfy Lighvan (2014) * ----------------------------------------------------------------------------- * Differences from original HPS: * 1) No POS-tagger stage: * - HPS uses a POS tagger to route tokens to Noun, Adjective, or Verb * suffix rules directly. * - Here we fall back to the fixed order: Noun or Adjective, then Verb * when POS is unknown. * Reason: Implementation is tagger-agnostic and lightweight. * 2) Lexical "-AN" guard: * - Added `Protect_Lexical_AN` to avoid stripping true lexical endings * like "...stan", "...ran", and known stems (Iran, Tehran, Ensan). * Reason: reduces over-stemming where HPS relied on explicit hash lists. 
* ----------------------------------------------------------------------------- * Implemented by: https://saeiddrv.com * ----------------------------------------------------------------------------- */ stringescapes { } // ============================================================================ // Alphabet & special symbols // ============================================================================ stringdef alef '{U+0627}' stringdef aa '{U+0622}' stringdef be '{U+0628}' stringdef pe '{U+067E}' stringdef te '{U+062A}' stringdef se '{U+062B}' stringdef jim '{U+062C}' stringdef che '{U+0686}' stringdef heh_jimi '{U+062D}' stringdef khe '{U+062E}' stringdef dal '{U+062F}' stringdef zal '{U+0630}' stringdef re '{U+0631}' stringdef ze '{U+0632}' stringdef zhe '{U+0698}' stringdef sin '{U+0633}' stringdef shin '{U+0634}' stringdef sad '{U+0635}' stringdef zad '{U+0636}' stringdef ta '{U+0637}' stringdef za '{U+0638}' stringdef ain '{U+0639}' stringdef ghain '{U+063A}' stringdef fe '{U+0641}' stringdef ghaf '{U+0642}' stringdef kaf '{U+06A9}' stringdef gaf '{U+06AF}' stringdef lam '{U+0644}' stringdef mim '{U+0645}' stringdef nun '{U+0646}' stringdef vav '{U+0648}' stringdef heh '{U+0647}' stringdef ye '{U+06CC}' stringdef space '{U+0020}' stringdef zero_width_joiner '{U+200D}' stringdef half_space '{U+200C}' // zero-width non-joiner (ZWNJ) // Note: U+200C (ZERO WIDTH NON-JOINER) is commonly referred to as "nim-faseleh" ("half-space") in Persian. // Although this name is not standard in English Unicode terminology, it reflects its widespread use // in Persian as a morpheme separator within words. 
This character is here to normalize // Arabic variants to normalize to Persian forms stringdef ar_kaf '{U+0643}' stringdef ar_ye '{U+064A}' stringdef ar_ye_with_hamza_above '{U+0626}' stringdef ar_heh '{U+06C1}' stringdef ar_teh_marbuta '{U+0629}' stringdef ar_alef_with_hamza_above '{U+0623}' stringdef ar_alef_with_hamza_below '{U+0625}' stringdef ar_vav_with_hamza_above '{U+0624}' // ============================================================================ // Declarations (routines, flags, and marks) // ============================================================================ routines ( Normalize_Characters // Forward: Unicode/script normalization Prefixes // Forward: handles prefixes Delete_ZWNJ // Forward: delete remaining ZWNJs after prefix handling Protect_Lexical_AN // Probe: flag lexical -AN endings (per pass) R1 // Test if cursor is in R1 // Noun tier (HPS categories) AN_Exception Irregular_Noun Stem_Noun_or_Adjective Stem_Verb ) externals ( stem ) // Guard flags: // - 'saw_present_prefix': // true when a present-tense prefix (mi-/nemi-) is stripped; used as the // default for `remove_verb_person_endings` for all suffix removal passes. // - 'remove_verb_person_endings': // true if a verb cue has been seen; used to enable verb person-ending rules // in this pass. booleans ( saw_present_prefix remove_verb_person_endings ) integers ( p1 ) // ============================================================================ // PHASE 1 — NORMALIZATION (forward) // - Unify Arabic forms to Persian letters for stable downstream matching. // - Delete ZWJ (U+200D) and ASCII spaces so the token is solid. // - ZWNJ (U+200C) is preserved here; used as prefix-stem separator in Phase 2, // then deleted by Delete_ZWNJ after prefix stripping is complete. 
// ============================================================================

// Forward pass over the whole token: at each position a listed character is
// rewritten or dropped; anything else is stepped over.
define Normalize_Characters as (
    repeat (
        [substring] among (
            // Arabic code points -> the Persian letter the later rules expect.
            '{ar_kaf}'
                (<- '{kaf}')
            '{ar_ye}' '{ar_ye_with_hamza_above}'
                (<- '{ye}')
            '{ar_teh_marbuta}' '{ar_heh}'
                (<- '{heh}')
            '{ar_alef_with_hamza_above}' '{ar_alef_with_hamza_below}'
                (<- '{alef}')
            '{ar_vav_with_hamza_above}'
                (<- '{vav}')

            // ZWJ and plain spaces are removed.  ZWNJ is deliberately not
            // listed here, so it survives for Prefixes to anchor on.
            '{zero_width_joiner}' '{space}'
                (delete)

            // Nothing to normalize at this position: advance one character.
            // `next` fails at the end of the token, which ends the `repeat`.
            ''
                (next)
        )
    )
)

// ============================================================================
// PHASE 2 — PREFIX HANDLING (forward, anchored)
// - HPS: handle known prefixes before suffix processing.
// - Requires ZWNJ (U+200C) between prefix and stem to avoid mishandling
//   words that begin with the same syllables but are not prefixed forms.
// - At least two characters are required after the prefix.
// - Sets `saw_present_prefix` when matching {mi}/{nemi} to enable person endings later.
// - Note: negating derivational prefixes na- and bi- are intentionally not
//   stripped as they create words with opposite meanings (e.g. na+dorost =
//   incorrect, bi+khatr = safe) and conflating them with their base forms
//   would harm search precision.
// - Similarly, nemi- negates the meaning so we don't remove it but do still
//   set `saw_present_prefix` when it is present.
// ============================================================================

define Prefixes as (
    [substring] among (
        // nemi- followed by ZWNJ: kept in the word, but recorded as a verb
        // cue.  `hop 2` enforces the two-characters-after-prefix rule.
        '{nun}{mim}{ye}{half_space}'
            (hop 2 set saw_present_prefix)

        // mi- followed by ZWNJ: same length rule, then the prefix (including
        // the ZWNJ) is removed and the verb cue recorded.
        '{mim}{ye}{half_space}'
            (hop 2 delete set saw_present_prefix)
    )
)

// ============================================================================
// PHASE 2b — DELETE REMAINING ZWNJs (forward)
// - After prefix detection, any remaining ZWNJs (e.g. in compound words)
//   are no longer needed and are removed here.
// ============================================================================

// Repeatedly move forward to the next ZWNJ and delete it; the loop ends as
// soon as `goto` cannot find another one.
define Delete_ZWNJ as (
    repeat (
        goto (['{half_space}'] delete)
    )
)

// The routines from here up to the matching close parenthesis are defined for
// backward (right-to-left) scanning.
backwardmode (

// Succeeds when the cursor is at or beyond mark p1.
define R1 as $p1 <= cursor

// ============================================================================
// PROBE — PROTECT LEXICAL "-AN" (not plural)
// - HPS strips -AN as a plural; however, many stems end in orthographic "AN":
//   ...stan / ...san / ...ran / ...van (place names/lexical stems), and
//   frequent stems such as Iran/Tehran/Ensan/Ostan.
// - We signal f (i.e. the routine fails) if a trailing pattern indicates
//   lexical AN: either the word is one of the AN_Exception stems or it ends
//   in one of the patterns listed below.
// - This is a probe only: it NEVER edits the buffer; it only signals.
//   Both checks sit under `not`, so the cursor is put back as well.
// ============================================================================

define Protect_Lexical_AN as (
    not AN_Exception
    not among (
        '{sin}{te}{alef}{nun}'   // ...stan
        '{sin}{alef}{nun}'       // ...san
        '{re}{alef}{nun}'        // ...ran
        '{vav}{alef}{nun}'       // ...van: onvaan, divaan, javaan, ravaan
    )
)

// ============================================================================
// PHASE 3 — SUFFIX STRIPPING (backward)
// - All routines in backward mode start matching from the end of the token.
// - IMPORTANT: no successful "no-op" arms; every success must modify the text.
//   This ensures 'repeat' loops terminate.
// - Most suffixes are only removed/replaced if they are entirely in R1 to
//   avoid over-stemming short words.
// ============================================================================

// --- Lexical "-AN" suffix list.
// Small curated set of stems ending in orthographic -AN that are not plural.
// Used by `Protect_Lexical_AN` to prevent stripping on place names / lexical nouns.
define AN_Exception as (
    // Whole-word test: one of the stems below has to match at the end of the
    // word, and the match must then reach the limit (`atlimit`), i.e. no
    // further characters may precede it.
    among (
        '{aa}{lam}{mim}{alef}{nun}'                      // alman
        '{aa}{sin}{mim}{alef}{nun}'                      // asman
        '{alef}{ye}{mim}{alef}{nun}'                     // eiman
        '{alef}{ye}{shin}{alef}{nun}'                    // ishan
        '{alef}{mim}{kaf}{alef}{nun}'                    // emkan
        '{alef}{sad}{fe}{heh}{alef}{nun}'                // esfahan
        '{aa}{zal}{re}{be}{alef}{ye}{jim}{alef}{nun}'    // azerbaijan
        '{be}{ye}{alef}{nun}'                            // bayan
        '{pe}{alef}{ye}{alef}{nun}'                      // payan
        '{pe}{ye}{mim}{alef}{nun}'                       // payman
        '{jim}{re}{ye}{alef}{nun}'                       // jaryan
        '{dal}{re}{mim}{alef}{nun}'                      // darman
        '{re}{mim}{alef}{nun}'                           // roman
        '{ze}{nun}{dal}{alef}{nun}'                      // zendan
        '{sin}{alef}{ze}{mim}{alef}{nun}'                // sazman
        '{sin}{lam}{ta}{alef}{nun}'                      // soltan
        '{gaf}{ye}{lam}{alef}{nun}'                      // Gilan
        '{ghaf}{heh}{re}{mim}{alef}{nun}'                // ghahramaan
        '{kaf}{re}{mim}{alef}{nun}'                      // Kerman
        '{khe}{alef}{nun}{dal}{alef}{nun}'               // khandan
        '{lam}{be}{nun}{alef}{nun}'                      // lobnan
        '{mim}{ye}{ze}{alef}{nun}'                       // mizan
        '{mim}{sin}{lam}{mim}{alef}{nun}'                // mosalman
        '{nun}{shin}{alef}{nun}'                         // neshan
        '{heh}{mim}{dal}{alef}{nun}'                     // hamedan
        '{ye}{vav}{nun}{alef}{nun}'                      // yunan
        '{kaf}{heh}{kaf}{shin}{alef}{nun}'               // kahkeshan (galaxy)
        '{aa}{te}{shin}{fe}{shin}{alef}{nun}'            // atashfeshan (volcano)
        '{pe}{re}{ye}{shin}{alef}{nun}'                  // perishan (confused)
        '{dal}{re}{khe}{shin}{alef}{nun}'                // darakhshan (shining)
        '{heh}{mim}{ze}{mim}{alef}{nun}'                 // hamzaman (simultaneous)
        '{sin}{alef}{khe}{te}{mim}{alef}{nun}'           // sakhteman (building)
        '{sin}{lam}{ye}{mim}{alef}{nun}'                 // soleyman (Solomon)
    )
    atlimit
)

// --- Noun irregular rewrites.
// Only include entries that actually change the buffer.
define Irregular_Noun as (
    [substring] among (
        // Transliterates roughly as akhbar -> khabar.
        '{alef}{khe}{be}{alef}{re}'
            (<- '{khe}{be}{re}')

        // Transliterates roughly as asatid -> ostad.
        '{alef}{sin}{alef}{te}{ye}{dal}'
            (<- '{alef}{sin}{te}{alef}{dal}')
    )
)

// --- Noun/Adjective step.
// Tries the irregular-noun rewrites first; failing that, removes a single
// noun/adjective ending, with matching confined by `setlimit tomark p1`.
define Stem_Noun_or_Adjective as (
    Irregular_Noun
    or
    setlimit tomark p1 for (
        [substring] among (

            // Adjective comparative (HPS).  This is the only guarded arm:
            // -TAR is removed only when something precedes it.
            '{te}{re}'              // -TAR
                (not atlimit delete)

            // Every entry from here on shares the single `(delete)` at the
            // bottom of the list.

            // Noun possessive suffixes
            '{alef}{mim}'           // -AM
            '{alef}{shin}'          // -ASH

            // -MAN/-TAN/-SHAN are intentionally disabled: these 3-char suffixes
            // are ambiguous with plurals of nouns whose root ends in the same letter:
            //   ketab+maan (our book)   -> strip -maan -> ketab (correct)
            //   mardom+an (people pl.)  -> strip -an   -> mardom (correct, NOT mardom->mard)
            //   naqqash+an (painters)   -> strip -an   -> naqqash (correct, NOT ->naqqaa)
            //   derakht+an (trees)      -> strip -an   -> derakht (correct, NOT ->derakh)
            // Without a lexicon we cannot distinguish the two cases. The -AN
            // rule below handles both at the cost of missing possessive stripping.
            // '{mim}{alef}{nun}' (delete) // -MAN
            // '{te}{alef}{nun}' (delete) // -TAN
            // '{shin}{alef}{nun}' (delete) // -SHAN

            // Noun plurals.
            // -YAN and -GAN are productive.
            '{ye}{alef}{nun}'       // -YAN
            '{gaf}{alef}{nun}'      // -GAN
            '{heh}{alef}{ye}'       // -HAY
            '{alef}{nun}{ye}'       // -ANI
            '{heh}{alef}'           // -HA
            '{alef}{te}'            // -AT (Arabic sound plural; can also be possessive "your")
            '{alef}{nun}'           // -AN (only if not lexical)
            '{ye}{nun}'             // -IN (also adjective derivation ending)

            // Other derivational noun endings (conservative set).
            '{gaf}{alef}{heh}'      // -GAH
            '{be}{alef}{nun}'       // -BAN
            '{gaf}{ye}'             // -GI (abstract noun)
            '{ye}{te}'              // -YAT
            '{ye}{ye}'              // -YY (double Y; also adj. relative -Y)

            // Adjective: superlative (HPS).
            '{te}{re}{ye}{nun}'     // -TARIN

            // Adjective derivational endings (HPS list + common variants).
            '{alef}{nun}{heh}'      // -ANE
            '{mim}{nun}{dal}'       // -MAND
            '{vav}{alef}{re}'       // -VAR
            '{nun}{alef}{kaf}'      // -NAK
            '{gaf}{alef}{re}'       // -GAR
                (delete)
        )
    )
)

// --- Verb step.
define Stem_Verb as ( ( // Participle + person/aux clitic tails [substring] among ( '{alef}{ye}{dal}' '{alef}{ye}{mim}' '{alef}{nun}{dal}' '{alef}{sin}{te}' '{alef}{sin}' '{alef}{ye}' '{ye}{dal}' '{ye}{mim}' (R1 delete) ) ) or ( [substring] among ( // Generic person and singular/plural endings — only when a verb cue is active '{alef}{nun}{dal}' // -AND '{ye}{dal}' // -ID '{ye}{mim}' // -IM '{alef}{mim}' // -AM '{dal}' // -D '{mim}' // -M ( remove_verb_person_endings R1 delete ) // Specific past-3sg stem fixes '{re}{fe}{te}{mim}' '{re}{fe}{te}{ye}' '{re}{fe}{te}{ye}{mim}' '{re}{fe}{te}{ye}{dal}' '{re}{fe}{te}{alef}{nun}{dal}' (<- '{re}{fe}{te}') // Mood/tense markers (HPS). '{nun}{dal}{heh}' // agent noun -NDEH → present root '{alef}{nun}' // infinitive -N / -AN (R1 delete set remove_verb_person_endings) '{dal}{heh}' // past part. -deh (not atlimit <-'{dal}' set remove_verb_person_endings) '{te}{heh}' // past part. -teh (not atlimit <-'{te}' set remove_verb_person_endings) ) ) ) ) // ============================================================================ // MAIN (HPS pipeline) // ============================================================================ define stem as ( unset saw_present_prefix // 1) Normalize script/spacing (ZWNJ preserved for prefix detection) do Normalize_Characters // 2) Handle leading prefixes (requires ZWNJ separator) do Prefixes // 2b) Delete remaining ZWNJs now that prefix detection is done do Delete_ZWNJ // Set p1 to be 3 characters into the string. $p1 = limit do ( hop 3 setmark p1 ) // 3) Remove suffixes. Each pass removes one suffix, trying noun and // adjective endings first, then verb endings. So if we remove a noun // or adjective ending, we next check for another noun or adjective // ending. // // Note: The loop exits if no suffix is removed and the buffer gets // shorter when a suffix is removed so we can't loop forever. backwards repeat test ( // Set remove_verb_person_endings = saw_present_prefix. 
unset remove_verb_person_endings try ( saw_present_prefix set remove_verb_person_endings ) Protect_Lexical_AN // HPS as described in the paper uses a POS tagger. We don't have that // information so we use a heuristic and try noun/adjective endings first, // then verb endings. Stem_Noun_or_Adjective or Stem_Verb ) ) snowball-3.1.0/algorithms/polish.sbl000066400000000000000000000176671520373054300175300ustar00rootroot00000000000000/* Polish stemmer. Author: Dmitry Shachnev */ stringescapes {} stringdef ak '{U+0105}' // ą a + ogonek stringdef ek '{U+0119}' // ę e + ogonek stringdef l/ '{U+0142}' // ł l + stroke stringdef c' '{U+0107}' // ć c + acute (kreska) stringdef n' '{U+0144}' // ń n + acute (kreska) stringdef o' '{U+00f3}' // ó o + acute (kreska) stringdef s' '{U+015b}' // ś s + acute (kreska) stringdef z' '{U+017a}' // ź z + acute (kreska) externals (stem) routines ( mark_regions remove_endings normalize_consonant R1 ) integers ( p1 ) groupings ( v ) define v 'a{ak}e{ek}io{o'}uy' define mark_regions as ( $p1 = limit gopast v gopast non-v setmark p1 ) backwardmode ( define R1 as ($p1 <= cursor) define remove_endings as ( // Verbs. 
do ( setlimit tomark p1 for ([substring]) among ( // conditionals: 'bym' // 1st person singular (czytał(a)bym) 'by{s'}' // 2nd person singular (czytał(a)byś) 'by{s'}my' // 1st person plural (czytalibyśmy) 'by{s'}cie' // 2nd person plural (czytalibyście) 'by' // 3rd person singular/plural (czytał(a)by, czytaliby) (delete) ) ) [substring] among ( 'asz' 'esz' 'isz' // present 2nd person singular (czytasz, piszesz, nosisz) 'amy' 'emy' 'imy' // present 1st person plural (czytamy, piszemy, nosimy) 'acie' 'ecie' 'icie' // present 2nd person plural (czytacie, piszecie, nosicie) 'aj{ak}' // present 3rd person plural (czytają) 'e{s'}{c'}' // infinitive (przynieść) 'a{s'}{c'}' // infinitive (popaść) 'a{c'}' // infinitive (czytać) 'ie{c'}' // infinitive (lecieć) 'i{c'}' // infinitive (wozić) '{ak}{c'}' // infinitive (marznąć) 'aj{ak}c' '{ak}c' // contemporary adverbial participle (transgressive) (czytając, lecąc) 'a{l/}em' 'ia{l/}em' 'i{l/}em' // past 1st person singular masculine (czytałem, leciałem, chodziłem) 'a{l/}am' 'ia{l/}am' 'i{l/}am' 'am' // past 1st person singular feminine (czytałam, leciałam, chodziłam, marzłam) 'a{l/}e{s'}' 'ia{l/}e{s'}' 'i{l/}e{s'}' // past 2nd person singular masculine (czytałeś, leciałeś, chodziłeś) 'a{l/}a{s'}' 'ia{l/}a{s'}' 'i{l/}a{s'}' // past 2nd person singular feminine (czytałaś, leciałaś, chodziłaś) 'a{l/}' 'ia{l/}' 'i{l/}' // past 3rd person singular masculine (czytał, leciał, chodził) 'a{l/}a' 'ia{l/}a' 'i{l/}a' // past 3rd person singular feminine (czytała, leciała, chodziła) 'a{l/}o' 'ia{l/}o' 'i{l/}o' // past 3rd person singular neuter (czytało, leciało, chodziło) 'ali{s'}my' 'ieli{s'}my' 'ili{s'}my' // past 1st person plural virile (czytaliśmy, lecieliśmy, chodziliśmy) 'a{l/}y{s'}my' 'ia{l/}y{s'}my' 'i{l/}y{s'}my' // past 1st person plural nonvirile (czytałyśmy, leciałyśmy, chodziłyśmy) 'ali{s'}cie' 'ieli{s'}cie' 'ili{s'}cie' // past 2nd person plural virile (czytaliście, lecieliście, chodziliście) 'a{l/}y{s'}cie' 'ia{l/}y{s'}cie' 
'i{l/}y{s'}cie' // past 2nd person plural nonvirile (czytałyście, leciałyście, chodziłyście) 'ali' 'ieli' 'ili' // past 3rd person plural virile (czytali, lecieli, chodzili) 'a{l/}y' 'ia{l/}y' 'i{l/}y' // past 3rd person plural nonvirile (czytały, leciały, chodziły) 'aj' // imperative 2nd person singular (czytaj) 'ajcie' // imperative 2nd person plural (czytajcie) 'cie' // imperative 2nd person plural (chodźcie) '{ek}' // present 1st person singular (lecę) (delete) 'sz{ek}' // present 1st person singular (noszę) (<- 's') 'sz{ak}' // present 3rd person plural (noszą) // Also an adjectival form (singular feminine accusative), e.g. lepszą. // This heuristic does the right thing in common cases. (R1 and delete or <-'s') // There are short verbs whose root consists of only one consonant, e.g. być, żyć. // Stemming them to one letter would merge them with these letters used in other // contexts, which is undesirable. But let's at least merge all past tense forms // together, e.g. byłem, byłam, byłyśmy, etc. to był. '{l/}e{s'}' '{l/}a{s'}' 'li{s'}my' '{l/}y{s'}my' 'li{s'}cie' '{l/}y{s'}cie' (<- '{l/}') // Adjectives (including comparative/superlative forms) // as well as participles. 'y' // singular masculine nominative (nowy) 'ego' 'iego' // singular masculine genitive (nowego, polskiego) 'emu' 'iemu' // singular masculine dative (nowemu, polskiemu) 'ym' 'im' // singular masculine instrumental (nowym, polskim) 'ej' 'iej' // singular feminine genitive (nowej, polskiej) 'ych' 'ich' // plural genitive (nowych, polskich) 'ymi' 'imi' // plural instrumental (nowymi, polskimi) ( delete try ( [substring] among ( 'aj{ak}c' // participle suffix (czytający) '{ak}c' // participle suffix (lecący) 'iejsz' // comparative suffix (piękniejszy) 'sz' // comparative suffix (lepszy) (delete) 'sz{ak}c' // participle suffix (noszący) (<- 's') ) ) ) // We cannot remove endings like -ą and -e unconditionally, because these // letters appear in too many contexts. 
But we can safely remove them if we // know that our word is a participle or a comparative/superlative form. 'aj{ak}ca' '{ak}ca' 'iejsza' 'sza' // singular feminine nominative (czytająca, lecąca, piękniejsza, lepsza) 'aj{ak}c{ak}' '{ak}c{ak}' 'iejsz{ak}' // singular feminine accusative (czytającą, lecącą, piękniejszą); -szą is handled separately 'aj{ak}ce' '{ak}ce' 'iejsze' 'sze' // singular neuter nominative (czytające, lecące, piękniejsze, lepsze) (delete) // Handle participles like nosząca, prosząca. 'sz{ak}ca' 'sz{ak}c{ak}' 'sz{ak}ce' (<- 's') // Noun forms (excluding endings that were already handled above). 'a' R1 'o' R1 // singular nominative (książka, lato) 'i' R1 'u' R1 'ia' R1 // singular genitive (książki, stołu, słonia) 'owi' R1 'iowi' R1 // singular dative (stołowi, słoniowi) '{ak}' R1 'i{ak}' R1 'em' R1 'iem' R1 // singular instrumental (książką, możliwością, stołem, słoniem) 'e' R1 'iu' R1 // singular locative (stole, słoniu) 'ie' R1 // plural nominative (słonie) '{o'}w' R1 // plural genitive (stołów) 'om' R1 'iom' R1 // plural dative (książkom, słoniom) 'ami' R1 'iami' R1 // plural instrumental (książkami, słoniami) 'ach' R1 'iach' R1 // plural locative (książkach, słoniach) (delete) ) try (['{'}'] delete) ) define normalize_consonant as ( // Remove kreska mark, because most of oblique cases do not have it. // Don't mutate single character inputs. [substring] not atlimit among ( '{c'}' (<- 'c') // e.g. miłość → miłośc '{n'}' (<- 'n') // e.g. słoń → słon '{s'}' (<- 's') // e.g. gęś → gęs '{z'}' (<- 'z') // e.g. miedź → miedz ) ) ) define stem as ( do mark_regions // Make sure we don't produce too short outputs. The "backwards" will // set the backwards limit to the current cursor position. 
( hop 2 backwards remove_endings ) or ( backwards normalize_consonant ) ) snowball-3.1.0/algorithms/porter.sbl000066400000000000000000000056541520373054300175360ustar00rootroot00000000000000integers ( p1 p2 ) booleans ( Y_found ) routines ( shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b ) externals ( stem ) groupings ( v v_WXY ) define v 'aeiouy' define v_WXY v + 'wxY' backwardmode ( define shortv as ( non-v_WXY v non-v ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( [substring] among ( 'sses' (<-'ss') 'ies' (<-'i') 'ss' () 's' (delete) ) ) define Step_1b as ( [substring] among ( 'eed' (R1 <-'ee') 'ed' 'ing' ( test gopast v delete test substring among( 'at' 'bl' 'iz' (insert 'e') 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // ignoring double c, h, j, k, q, v, w, and x ([next] delete) '' (atmark p1 test shortv insert 'e') ) ) ) ) define Step_1c as ( ['y' or 'Y'] gopast v <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'eli' (<-'e') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alli' (<-'al') 'alism' 'aliti' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 'biliti' (<-'ble') ) ) define Step_3 as ( [substring] R1 among ( 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ative' 'ful' 'ness' (delete) ) ) define Step_4 as ( [substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' ('s' or 't' delete) ) ) define Step_5a as ( ['e'] R2 or (R1 not shortv) delete ) define Step_5b as ( ['l'] R2 'l' delete ) ) define stem as ( unset Y_found do ( ['y'] <-'Y' set Y_found) do repeat(goto (v ['y']) <-'Y' set Y_found) $p1 = limit $p2 = limit do( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) backwards ( do Step_1a do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do 
Step_5a do Step_5b ) do(Y_found repeat(goto (['Y']) <-'y')) ) snowball-3.1.0/algorithms/portuguese.sbl000066400000000000000000000126261520373054300204220ustar00rootroot00000000000000routines ( prelude postlude mark_regions RV R1 R2 standard_suffix verb_suffix residual_suffix residual_form ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico stringdef e' '{U+00E9}' // e-acute stringdef e^ '{U+00EA}' // e-circumflex stringdef i' '{U+00ED}' // i-acute stringdef o^ '{U+00F4}' // o-circumflex stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute stringdef cc '{U+00E7}' // c-cedilla stringdef a~ '{U+00E3}' // a-tilde stringdef o~ '{U+00F5}' // o-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' define prelude as repeat ( [substring] among( '{a~}' (<- 'a~') '{o~}' (<- 'o~') '' (next) ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'a~' (<- '{a~}') 'o~' (<- '{o~}') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'eza' 'ezas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' '{a'}vel' '{i'}vel' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amento' 'amentos' 'imento' 'imentos' 'adora' 'ador' 'a{cc}a~o' 'adoras' 'adores' 'a{cc}o~es' // no -ic test 'ante' 'antes' '{a^}ncia' ( R2 delete ) 'logia' 'logias' ( R2 <- 'log' ) 'u{cc}a~o' 'u{cc}o~es' ( R2 <- 'u' ) '{e^}ncia' '{e^}ncias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' 'avel' '{i'}vel' (R2 delete) ) 
) ) 'idade' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) 'ira' 'iras' ( RV 'e' // -eira -eiras usually non-verbal <- 'ir' ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' 'ira' 'iras' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) ) ) define residual_form as ( [substring] among( 'e' '{e'}' '{e^}' ( RV delete [('u'] test 'g') or ('i'] test 'c') RV delete ) '{cc}' (<-'c') ) ) ) define stem as ( do prelude do mark_regions backwards ( do ( ( ( standard_suffix or verb_suffix ) and do ( ['i'] test 'c' RV delete ) ) or residual_suffix ) do residual_form ) do postlude ) snowball-3.1.0/algorithms/romanian.sbl000066400000000000000000000146041520373054300200220ustar00rootroot00000000000000 routines ( norm prelude postlude mark_regions RV R1 R2 step_0 standard_suffix combo_suffix verb_suffix vowel_suffix ) externals ( stem 
) integers ( pV p1 p2 ) groupings ( v ) booleans ( standard_suffix_removed ) stringescapes {} /* special characters */ stringdef a^ '{U+00E2}' // a circumflex stringdef i^ '{U+00EE}' // i circumflex stringdef a+ '{U+0103}' // a breve stringdef sc '{U+015F}' // s cedilla stringdef tc '{U+0163}' // t cedilla stringdef s, '{U+0219}' // s comma stringdef t, '{U+021B}' // t comma define v 'aeiou{a^}{i^}{a+}' // Normalize old cedilla forms to correct comma-below forms. define norm as ( do repeat goto ( [substring] among ( '{sc}' (<- '{s,}') '{tc}' (<- '{t,}') ) ) ) define prelude as ( repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define step_0 as ( [substring] R1 among( 'ul' 'ului' ( delete ) 'aua' ( <-'a' ) 'ea' 'ele' 'elor' ( <-'e' ) 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' ( <-'i') 'ile' ( not 'ab' <- 'i' ) 'atei' ( <- 'at' ) 'a{t,}ie' 'a{t,}ia' ( <- 'a{t,}i' ) ) ) define combo_suffix as test ( [substring] R1 ( among( /* 'IST'. 
alternative: include the following 'alism' 'alisme' 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( <- 'al' ) */ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( <- 'abil' ) 'ibilitate' ( <- 'ibil' ) 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( <- 'iv' ) 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' 'icator' 'icatori' 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( <- 'ic' ) 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' 'atoare' 'ator' 'atori' '{a+}toare' '{a+}tor' '{a+}tori' ( <- 'at' ) 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' 'itoare' 'itor' 'itori' ( <- 'it' ) ) set standard_suffix_removed ) ) define standard_suffix as ( unset standard_suffix_removed repeat combo_suffix [substring] R2 ( among( // past participle is treated here, rather than // as a verb ending: 'at' 'ata' 'at{a+}' 'ati' 'ate' 'ut' 'uta' 'ut{a+}' 'uti' 'ute' 'it' 'ita' 'it{a+}' 'iti' 'ite' 'ic' 'ica' 'ice' 'ici' 'ic{a+}' 'abil' 'abila' 'abile' 'abili' 'abil{a+}' 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' 'ant' 'anta' 'ante' 'anti' 'ant{a+}' 'ator' 'atori' 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( delete ) 'iune' 'iuni' ( '{t,}'] <- 't' ) 'ism' 'isme' 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( <- 'ist' /* 'IST'. 
alternative: remove with <- '' */ ) ) set standard_suffix_removed ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( // 'long' infinitive: 'are' 'ere' 'ire' '{a^}re' // gerund: 'ind' '{a^}nd' 'indu' '{a^}ndu' 'eze' 'easc{a+}' // present: 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' 'e{s,}te' '{a+}sc' '{a+}{s,}ti' '{a+}{s,}te' // imperfect: 'am' 'ai' 'au' 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' // past: // (not 'ii') 'ui' 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' // pluperfect: 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' '{a^}ser{a+}' 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' ( non-v or 'u' delete ) // present: '{a+}m' 'a{t,}i' 'em' 'e{t,}i' 'im' 'i{t,}i' '{a^}m' '{a^}{t,}i' // past: 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' 'sei' 'se' // pluperfect: 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' (delete) ) ) define vowel_suffix as ( [substring] RV among ( 'a' 'e' 'i' 'ie' '{a+}' ( delete ) ) ) ) define stem as ( do norm do prelude do mark_regions backwards ( do step_0 do standard_suffix do ( standard_suffix_removed or verb_suffix ) do vowel_suffix ) do postlude ) snowball-3.1.0/algorithms/russian.sbl000066400000000000000000000143451520373054300177040ustar00rootroot00000000000000stringescapes {} /* the 33 Cyrillic letters represented in ASCII characters following the * conventions of the standard Library of Congress transliteration: */ stringdef a '{U+0430}' stringdef b '{U+0431}' stringdef v '{U+0432}' stringdef g '{U+0433}' stringdef d '{U+0434}' stringdef e '{U+0435}' stringdef e" '{U+0451}' stringdef zh '{U+0436}' stringdef z '{U+0437}' stringdef i '{U+0438}' stringdef i` '{U+0439}' 
stringdef k '{U+043A}' stringdef l '{U+043B}' stringdef m '{U+043C}' stringdef n '{U+043D}' stringdef o '{U+043E}' stringdef p '{U+043F}' stringdef r '{U+0440}' stringdef s '{U+0441}' stringdef t '{U+0442}' stringdef u '{U+0443}' stringdef f '{U+0444}' stringdef kh '{U+0445}' stringdef ts '{U+0446}' stringdef ch '{U+0447}' stringdef sh '{U+0448}' stringdef shch '{U+0449}' stringdef " '{U+044A}' stringdef y '{U+044B}' stringdef ' '{U+044C}' stringdef e` '{U+044D}' stringdef iu '{U+044E}' stringdef ia '{U+044F}' routines ( mark_regions R2 perfective_gerund adjective adjectival reflexive verb noun derivational tidy_up ) externals ( stem ) integers ( pV p2 ) groupings ( v ) define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' define mark_regions as ( $pV = limit $p2 = limit do ( gopast v setmark pV gopast non-v gopast v gopast non-v setmark p2 ) ) backwardmode ( define R2 as $p2 <= cursor define perfective_gerund as ( [substring] among ( '{v}' '{v}{sh}{i}' '{v}{sh}{i}{s}{'}' ('{a}' or '{ia}' delete) '{i}{v}' '{i}{v}{sh}{i}' '{i}{v}{sh}{i}{s}{'}' '{y}{v}' '{y}{v}{sh}{i}' '{y}{v}{sh}{i}{s}{'}' (delete) ) ) define adjective as ( [substring] among ( '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' '{ia}{ia}' // and - '{o}{iu}' // - which is somewhat archaic '{e}{iu}' // - soft form of {o}{iu} (delete) ) ) define adjectival as ( adjective /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of errors. Removing im, uem, enn creates too many errors. 
*/ try ( [substring] among ( '{e}{m}' // present passive participle '{n}{n}' // adjective from past passive participle '{v}{sh}' // past active participle '{iu}{shch}' '{shch}' // present active participle ('{a}' or '{ia}' delete) //but not '{i}{m}' '{u}{e}{m}' // present passive participle //or '{e}{n}{n}' // adjective from past passive participle '{i}{v}{sh}' '{y}{v}{sh}'// past active participle '{u}{iu}{shch}' // present active participle (delete) ) ) ) define reflexive as ( [substring] among ( '{s}{ia}' '{s}{'}' (delete) ) ) define verb as ( [substring] among ( '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' '{n}{y}' '{t}{'}' '{e}{sh}{'}' '{n}{n}{o}' ('{a}' or '{ia}' delete) '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' (delete) /* note the short passive participle tests: '{n}{a}' '{n}' '{n}{o}' '{n}{y}' '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' */ ) ) define noun as ( [substring] among ( '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' (delete) /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' omitted - they only occur on 12 words. 
*/ ) ) define derivational as ( [substring] R2 among ( '{o}{s}{t}' '{o}{s}{t}{'}' (delete) ) ) define tidy_up as ( [substring] among ( '{e}{i`}{sh}' '{e}{i`}{sh}{e}' // superlative forms (delete ['{n}'] '{n}' delete ) '{n}' ('{n}' delete) // e.g. -nno endings '{'}' (delete) // with some slight false conflations ) ) ) define stem as ( // Normalise {e"} to {e}. The documentation has long suggested the user // should do this before calling the stemmer - we now do it for them. do repeat ( goto (['{e"}']) <- '{e}' ) do mark_regions backwards setlimit tomark pV for ( do ( perfective_gerund or ( try reflexive adjectival or verb or noun ) ) try([ '{i}' ] delete) // because noun ending -i{iu} is being treated as verb ending -{iu} do derivational do tidy_up ) ) snowball-3.1.0/algorithms/serbian.sbl000066400000000000000000001530621520373054300176430ustar00rootroot00000000000000/* Stemmer for Serbian language, based on: * * Ljubesic, Nikola. Pandzic, Ivan. Stemmer for Croatian * http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/ * * authors: Stefan Petkovic and Dragan Ivanovic * emails: petkovic8 at gmail.com and dragan.ivanovic at uns.ac.rs */ routines ( cyr_to_lat prelude mark_regions R1 Step_1 Step_2 Step_3 ) externals ( stem ) booleans ( no_diacritics ) integers ( p1 ) groupings ( v ca sa rg ) stringescapes {} /* special characters - Unicode codepoints */ /* serbian cyrillic */ stringdef cyrA '{U+0430}' stringdef cyrB '{U+0431}' stringdef cyrV '{U+0432}' stringdef cyrG '{U+0433}' stringdef cyrD '{U+0434}' stringdef cyrDx '{U+0452}' stringdef cyrE '{U+0435}' stringdef cyrZh '{U+0436}' stringdef cyrZ '{U+0437}' stringdef cyrI '{U+0438}' stringdef cyrJ '{U+0458}' stringdef cyrK '{U+043A}' stringdef cyrL '{U+043B}' stringdef cyrLJ '{U+0459}' stringdef cyrM '{U+043C}' stringdef cyrN '{U+043D}' stringdef cyrNJ '{U+045A}' stringdef cyrO '{U+043E}' stringdef cyrP '{U+043F}' stringdef cyrR '{U+0440}' stringdef cyrS '{U+0441}' stringdef cyrT '{U+0442}' stringdef cyrCy 
'{U+045B}' stringdef cyrU '{U+0443}' stringdef cyrF '{U+0444}' stringdef cyrH '{U+0445}' stringdef cyrC '{U+0446}' stringdef cyrCx '{U+0447}' stringdef cyrDzx '{U+045F}' stringdef cyrSx '{U+0448}' /* serbian latin with diacritics */ stringdef cv '{U+010D}' // small c with caron stringdef c' '{U+0107}' // small c with acute stringdef zv '{U+017E}' // small z with caron stringdef sv '{U+0161}' // small s with caron stringdef d/ '{U+0111}' // small d with stroke define v 'aeiou' define sa '{cv}{c'}{zv}{sv}{d/}' define ca 'bvgdzjklmnprstfhc' + sa define rg 'r' define cyr_to_lat as ( do repeat goto ( [substring] among ( '{cyrA}' (<- 'a') '{cyrB}' (<- 'b') '{cyrV}' (<- 'v') '{cyrG}' (<- 'g') '{cyrD}' (<- 'd') '{cyrDx}' (<- '{d/}') '{cyrE}' (<- 'e') '{cyrZh}' (<- '{zv}') '{cyrZ}' (<- 'z') '{cyrI}' (<- 'i') '{cyrJ}' (<- 'j') '{cyrK}' (<- 'k') '{cyrL}' (<- 'l') '{cyrLJ}' (<- 'lj') '{cyrM}' (<- 'm') '{cyrN}' (<- 'n') '{cyrNJ}' (<- 'nj') '{cyrO}' (<- 'o') '{cyrP}' (<- 'p') '{cyrR}' (<- 'r') '{cyrS}' (<- 's') '{cyrT}' (<- 't') '{cyrCy}' (<- '{c'}') '{cyrU}' (<- 'u') '{cyrF}' (<- 'f') '{cyrH}' (<- 'h') '{cyrC}' (<- 'c') '{cyrCx}' (<- '{cv}') '{cyrDzx}' (<- 'd{zv}') '{cyrSx}' (<- '{sv}') ) ) ) define prelude as ( do repeat goto ( ca ['ije'] ca <- 'e' ) do repeat goto ( ca ['je'] ca <- 'e' ) do repeat goto ( ['dj'] <- '{d/}' ) ) define mark_regions as ( set no_diacritics do ( gopast sa unset no_diacritics ) $p1 = limit do ( gopast v setmark p1 ($p1 < 2) ( gopast non-v setmark p1 ) ) do ( gopast 'r' $(cursor >= 2) or (gopast non-rg) $(p1 - cursor > 1) setmark p1 ) ) backwardmode ( define R1 as $p1 <= cursor define Step_1 as ( [substring] among ( 'lozi' 'lozima' (<-'loga') 'pesi' 'pesima' (<-'peh') 'vojci' (<-'vojka') 'bojci' (<-'bojka') 'jaci' 'jacima' (<-'jak') '{cv}ajan' (<-'{cv}ajni') 'cajan' (no_diacritics <-'cajni') 'eran' (<-'erni') 'laran' (<-'larni') 'esan' (<-'esni') 'anjac' (<-'anjca') 'ajac' 'ajaca' (<-'ajca') 'ljaca' 'ljac' (<-'ljca') 'ejac' 'ejaca' (<-'ejca') 'ojac' 
'ojaca' (<-'ojca') 'ajaka' (<-'ajka') 'ojaka' (<-'ojka') '{sv}aca' '{sv}ac' (<-'{sv}ca') 'inzima' 'inzi' (<-'ing') 'tvenici' (<-'tvenik') 'tetici' 'teticima' (<-'tetika') 'nstava' (<-'nstva') 'nicima' (<-'nik') 'ticima' (<-'tik') 'zicima' (<-'zik') 'snici' (<-'snik') 'kuse' (<-'kusi') 'kusan' (<-'kusni') 'kustava' (<-'kustva') 'du{sv}an' (<-'du{sv}ni') 'dusan' (no_diacritics <-'dusni') 'antan' (<-'antni') 'bilan' (<-'bilni') 'tilan' (<-'tilni') 'avilan' (<-'avilni') 'silan' (<-'silni') 'gilan' (<-'gilni') 'rilan' (<-'rilni') 'nilan' (<-'nilni') 'alan' (<-'alni') 'ozan' (<-'ozni') 'rave' (<-'ravi') 'stavan' (<-'stavni') 'pravan' (<-'pravni') 'tivan' (<-'tivni') 'sivan' (<-'sivni') 'atan' (<-'atni') 'enat' (<-'enta') 'tetan' (<-'tetni') 'pletan' (<-'pletni') '{sv}ave' (<-'{sv}avi') 'save' (no_diacritics <-'savi') 'anata' (<-'anta') 'a{cv}ak' 'a{cv}aka' (<-'a{cv}ka') 'acak' 'acaka' (no_diacritics <-'acka') 'u{sv}ak' (<-'u{sv}ka') 'usak' (no_diacritics <-'uska') 'atak' 'ataka' 'atci' 'atcima' (<-'atka') 'etak' 'etaka' (<-'etka') 'itak' 'itaka' 'itci' (<-'itka') 'otak' 'otaka' (<-'otka') 'utak' 'utaka' 'utci' 'utcima' (<-'utka') 'eskan' (<-'eskna') 'ti{cv}an' (<-'ti{cv}ni') 'tican' (no_diacritics <-'ticni') 'ojsci' (<-'ojska') 'esama' (<-'esma') 'metar' 'metara' (<-'metra') 'centar' 'centara' (<-'centra') 'istar' 'istara' (<-'istra') 'o{sv}{c'}u' (<-'osti') 'oscu' (no_diacritics <-'osti') 'daba' (<-'dba') '{cv}cima' '{cv}ci' (<-'{cv}ka') 'mac' 'maca' (<-'mca') 'naca' 'nac' (<-'nca') 'voljan' (<-'voljni') 'anaka' (<-'anki') 'vac' 'vaca' (<-'vca') 'saca' 'sac' (<-'sca') 'raca' 'rac' (<-'rca') 'aoca' 'alaca' 'alac' (<-'alca') 'elaca' 'elac' (<-'elca') 'olaca' 'olac' 'olce' (<-'olca') 'njac' 'njaca' (<-'njca') 'ekata' 'ekat' (<-'ekta') 'izam' 'izama' (<-'izma') 'jebe' (<-'jebi') 'baci' (<-'baci') 'a{sv}an' (<-'a{sv}ni') 'asan' (no_diacritics <-'asni') ) ) define Step_2 as ( [substring] R1 among ( 'skijima' 'skijega' 'skijemu' 'skijem' 'skega' 'skemu' 'skem' 'skijim' 
'skijih' 'skijoj' 'skijeg' 'skiji' 'skije' 'skija' 'skoga' 'skome' 'skomu' 'skima' 'skog' 'skom' 'skim' 'skih' 'skoj' 'ski' 'ske' 'sko' 'ska' 'sku' (<-'sk') '{sv}kijima' '{sv}kijega' '{sv}kijemu' '{sv}kijem' '{sv}kega' '{sv}kemu' '{sv}kem' '{sv}kijim' '{sv}kijih' '{sv}kijoj' '{sv}kijeg' '{sv}kiji' '{sv}kije' '{sv}kija' '{sv}koga' '{sv}kome' '{sv}komu' '{sv}kima' '{sv}kog' '{sv}kom' '{sv}kim' '{sv}kih' '{sv}koj' '{sv}ki' '{sv}ke' '{sv}ko' '{sv}ka' '{sv}ku' (<-'{sv}k') 'stvima' 'stvom' 'stvo' 'stva' 'stvu' (<-'stv') '{sv}tvima' '{sv}tvom' '{sv}tvo' '{sv}tva' '{sv}tvu' (<-'{sv}tv') 'tanijama' 'tanijima' 'tanijom' 'tanija' 'taniju' 'tanije' 'taniji' (<-'tanij') 'manijama' 'manijima' 'manijom' 'manija' 'maniju' 'manije' 'maniji' (<-'manij') 'panijama' 'panijima' 'panijom' 'panija' 'paniju' 'panije' 'paniji' (<-'panij') 'ranijama' 'ranijima' 'ranijom' 'ranija' 'raniju' 'ranije' 'raniji' (<-'ranij') 'ganijama' 'ganijima' 'ganijom' 'ganija' 'ganiju' 'ganije' 'ganiji' (<-'ganij') 'aninom' 'anina' 'aninu' 'anine' 'anima' 'anin' 'anom' 'anu' 'ani' 'ana' 'ane' (<-'an') 'inima' 'inama' 'inom' 'ina' 'ine' 'ini' 'inu' 'ino' (<-'in') 'onovima' 'onova' 'onove' 'onovi' 'onima' 'onom' 'ona' 'one' 'oni' 'onu' (<-'on') 'nijima' 'nijega' 'nijemu' 'nijeg' 'nijem' 'nega' 'nemu' 'neg' 'nem' 'nijim' 'nijih' 'nijoj' 'niji' 'nije' 'nija' 'niju' 'nima' 'nome' 'nomu' 'noga' 'noj' 'nom' 'nih' 'nim' 'nog' 'no' 'ne' 'na' 'nu' 'ni' (<-'n') 'a{c'}oga' 'a{c'}ome' 'a{c'}omu' 'a{c'}ega' 'a{c'}emu' 'a{c'}ima' 'a{c'}oj' 'a{c'}ih' 'a{c'}om' 'a{c'}eg' 'a{c'}em' 'a{c'}og' 'a{c'}uh' 'a{c'}im' 'a{c'}e' 'a{c'}a' (<-'a{c'}') 'e{c'}oga' 'e{c'}ome' 'e{c'}omu' 'e{c'}ega' 'e{c'}emu' 'e{c'}ima' 'e{c'}oj' 'e{c'}ih' 'e{c'}om' 'e{c'}eg' 'e{c'}em' 'e{c'}og' 'e{c'}uh' 'e{c'}im' 'e{c'}e' 'e{c'}a' (<-'e{c'}') 'u{c'}oga' 'u{c'}ome' 'u{c'}omu' 'u{c'}ega' 'u{c'}emu' 'u{c'}ima' 'u{c'}oj' 'u{c'}ih' 'u{c'}om' 'u{c'}eg' 'u{c'}em' 'u{c'}og' 'u{c'}uh' 'u{c'}im' 'u{c'}e' 'u{c'}a' (<-'u{c'}') 'ugovima' 'ugovi' 'ugove' 'ugova' 
(<-'ugov') 'ugama' 'ugom' 'uga' 'uge' 'ugi' 'ugu' 'ugo' (<-'ug') 'logama' 'logom' 'loga' 'logu' 'loge' (<-'log') 'govima' 'gama' 'govi' 'gove' 'gova' 'gom' 'ga' 'ge' 'gi' 'gu' 'go' (<-'g') 'rarijem' 'rarija' 'rariju' 'rario' (<-'rari') 'otijem' 'otija' 'otiju' 'otio' (<-'oti') 'sijem' 'sija' 'siju' 'sio' (<-'si') 'lijem' 'lija' 'liju' 'lio' (<-'li') 'uju{c'}i' 'ujemo' 'ujete' 'ujmo' 'ujem' 'uje{sv}' 'uje' 'uju' (<-'uj') 'cajevima' 'cajevi' 'cajeva' 'cajeve' 'cajama' 'cajima' 'cajem' 'caja' 'caje' 'caji' 'caju' (<-'caj') '{cv}ajevima' '{cv}ajevi' '{cv}ajeva' '{cv}ajeve' '{cv}ajama' '{cv}ajima' '{cv}ajem' '{cv}aja' '{cv}aje' '{cv}aji' '{cv}aju' (<-'{cv}aj') '{c'}ajevima' '{c'}ajevi' '{c'}ajeva' '{c'}ajeve' '{c'}ajama' '{c'}ajima' '{c'}ajem' '{c'}aja' '{c'}aje' '{c'}aji' '{c'}aju' (<-'{c'}aj') '{d/}ajevima' '{d/}ajevi' '{d/}ajeva' '{d/}ajeve' '{d/}ajama' '{d/}ajima' '{d/}ajem' '{d/}aja' '{d/}aje' '{d/}aji' '{d/}aju' (<-'{d/}aj') 'lajevima' 'lajevi' 'lajeva' 'lajeve' 'lajama' 'lajima' 'lajem' 'laja' 'laje' 'laji' 'laju' (<-'laj') 'rajevima' 'rajevi' 'rajeva' 'rajeve' 'rajama' 'rajima' 'rajem' 'raja' 'raje' 'raji' 'raju' (<-'raj') 'bijima' 'bijama' 'bijom' 'bija' 'bije' 'biji' 'biju' 'bijo' (<-'bij') 'cijima' 'cijama' 'cijom' 'cija' 'cije' 'ciji' 'ciju' 'cijo' (<-'cij') 'dijima' 'dijama' 'dijom' 'dija' 'dije' 'diji' 'diju' 'dijo' (<-'dij') 'lijima' 'lijama' 'lijom' 'lije' 'liji' 'lijo' (<-'lij') 'nijama' 'nijom' 'nijo' (<-'nij') 'mijima' 'mijama' 'mijom' 'mija' 'mije' 'miji' 'miju' 'mijo' (<-'mij') '{zv}ijima' '{zv}ijama' '{zv}ijom' '{zv}ija' '{zv}ije' '{zv}iji' '{zv}iju' '{zv}ijo' (<-'{zv}ij') 'gijima' 'gijama' 'gijom' 'gija' 'gije' 'giji' 'giju' 'gijo' (<-'gij') 'fijima' 'fijama' 'fijom' 'fija' 'fije' 'fiji' 'fiju' 'fijo' (<-'fij') 'pijima' 'pijama' 'pijom' 'pija' 'pije' 'piji' 'piju' 'pijo' (<-'pij') 'rijima' 'rijama' 'rijom' 'rija' 'rije' 'riji' 'riju' 'rijo' (<-'rij') 'sijima' 'sijama' 'sijom' 'sije' 'siji' 'sijo' (<-'sij') 'tijima' 'tijama' 'tijom' 'tija' 'tije' 
'tiji' 'tiju' 'tijo' (<-'tij') 'zijima' 'zijama' 'zijom' 'zija' 'zije' 'ziji' 'ziju' 'zijo' (<-'zij') 'nalima' 'nalama' 'nalom' 'nala' 'nale' 'nali' 'nalu' 'nalo' (<-'nal') 'ijalima' 'ijalama' 'ijalom' 'ijala' 'ijale' 'ijali' 'ijalu' 'ijalo' (<-'ijal') 'ozilima' 'ozilom' 'ozila' 'ozile' 'ozilu' 'ozili' (<-'ozil') 'olovima' 'olovi' 'olova' 'olove' (<-'olov') 'olima' 'olom' 'ola' 'olu' 'ole' 'oli' (<-'ol') 'lemama' 'lemima' 'lemom' 'lema' 'leme' 'lemi' 'lemu' 'lemo' (<-'lem') 'ramama' 'ramom' 'rama' 'rame' 'rami' 'ramu' 'ramo' (<-'ram') 'arama' 'arima' 'arom' 'aru' 'ara' 'are' 'ari' (<-'ar') 'drama' 'drima' 'drom' 'dru' 'dra' 'dre' 'dri' (<-'dr') 'erama' 'erima' 'erom' 'eru' 'era' 'ere' 'eri' (<-'er') 'orama' 'orima' 'orom' 'oru' 'ora' 'ore' 'ori' (<-'or') 'esima' 'esom' 'ese' 'esa' 'esu' (<-'es') 'isima' 'isom' 'ise' 'isa' 'isu' (<-'is') 'ta{sv}ama' 'ta{sv}ima' 'ta{sv}om' 'ta{sv}em' 'ta{sv}a' 'ta{sv}u' 'ta{sv}i' 'ta{sv}e' (<-'ta{sv}') 'na{sv}ama' 'na{sv}ima' 'na{sv}om' 'na{sv}em' 'na{sv}a' 'na{sv}u' 'na{sv}i' 'na{sv}e' (<-'na{sv}') 'ja{sv}ama' 'ja{sv}ima' 'ja{sv}om' 'ja{sv}em' 'ja{sv}a' 'ja{sv}u' 'ja{sv}i' 'ja{sv}e' (<-'ja{sv}') 'ka{sv}ama' 'ka{sv}ima' 'ka{sv}om' 'ka{sv}em' 'ka{sv}a' 'ka{sv}u' 'ka{sv}i' 'ka{sv}e' (<-'ka{sv}') 'ba{sv}ama' 'ba{sv}ima' 'ba{sv}om' 'ba{sv}em' 'ba{sv}a' 'ba{sv}u' 'ba{sv}i' 'ba{sv}e' (<-'ba{sv}') 'ga{sv}ama' 'ga{sv}ima' 'ga{sv}om' 'ga{sv}em' 'ga{sv}a' 'ga{sv}u' 'ga{sv}i' 'ga{sv}e' (<-'ga{sv}') 'va{sv}ama' 'va{sv}ima' 'va{sv}om' 'va{sv}em' 'va{sv}a' 'va{sv}u' 'va{sv}i' 'va{sv}e' (<-'va{sv}') 'e{sv}ima' 'e{sv}ama' 'e{sv}om' 'e{sv}em' 'e{sv}i' 'e{sv}e' 'e{sv}a' 'e{sv}u' (<-'e{sv}') 'i{sv}ima' 'i{sv}ama' 'i{sv}om' 'i{sv}em' 'i{sv}i' 'i{sv}e' 'i{sv}a' 'i{sv}u' (<-'i{sv}') 'ikatima' 'ikatom' 'ikata' 'ikate' 'ikati' 'ikatu' 'ikato' (<-'ikat') 'latima' 'latom' 'lata' 'late' 'lati' 'latu' 'lato' (<-'lat') 'etama' 'etima' 'etom' 'eta' 'ete' 'eti' 'etu' 'eto' (<-'et') 'estima' 'estama' 'estom' 'esta' 'este' 'esti' 'estu' 'esto' (<-'est') 'istima' 
'istama' 'istom' 'ista' 'iste' 'isti' 'istu' 'isto' (<-'ist') 'kstima' 'kstama' 'kstom' 'ksta' 'kste' 'ksti' 'kstu' 'ksto' (<-'kst') 'ostima' 'ostama' 'ostom' 'osta' 'oste' 'osti' 'ostu' 'osto' (<-'ost') 'i{sv}tima' 'i{sv}tem' 'i{sv}ta' 'i{sv}te' 'i{sv}tu' (<-'i{sv}t') 'ovasmo' 'ovaste' 'ovahu' 'ovati' 'ova{sv}e' 'ovali' 'ovala' 'ovale' 'ovalo' 'ovat' 'ovah' 'ovao' (<-'ova') 'avijemu' 'avijima' 'avijega' 'avijeg' 'avijem' 'avemu' 'avega' 'aveg' 'avem' 'avijim' 'avijih' 'avijoj' 'avoga' 'avome' 'avomu' 'avima' 'avama' 'aviji' 'avije' 'avija' 'aviju' 'avim' 'avih' 'avoj' 'avom' 'avog' 'avi' 'ava' 'avu' 'ave' 'avo' (<-'av') 'evijemu' 'evijima' 'evijega' 'evijeg' 'evijem' 'evemu' 'evega' 'eveg' 'evem' 'evijim' 'evijih' 'evijoj' 'evoga' 'evome' 'evomu' 'evima' 'evama' 'eviji' 'evije' 'evija' 'eviju' 'evim' 'evih' 'evoj' 'evom' 'evog' 'evi' 'eva' 'evu' 'eve' 'evo' (<-'ev') 'ivijemu' 'ivijima' 'ivijega' 'ivijeg' 'ivijem' 'ivemu' 'ivega' 'iveg' 'ivem' 'ivijim' 'ivijih' 'ivijoj' 'ivoga' 'ivome' 'ivomu' 'ivima' 'ivama' 'iviji' 'ivije' 'ivija' 'iviju' 'ivim' 'ivih' 'ivoj' 'ivom' 'ivog' 'ivi' 'iva' 'ivu' 'ive' 'ivo' (<-'iv') 'ovijemu' 'ovijima' 'ovijega' 'ovijeg' 'ovijem' 'ovemu' 'ovega' 'oveg' 'ovijim' 'ovijih' 'ovijoj' 'ovoga' 'ovome' 'ovomu' 'ovima' 'oviji' 'ovije' 'ovija' 'oviju' 'ovim' 'ovih' 'ovoj' 'ovom' 'ovog' 'ovi' 'ova' 'ovu' 'ove' 'ovo' (<-'ov') 'movima' 'movom' 'mova' 'movu' 'move' 'movi' (<-'mov') 'lovima' 'lovom' 'lova' 'lovu' 'love' 'lovi' (<-'lov') 'elijemu' 'elijima' 'elijega' 'elijeg' 'elijem' 'elemu' 'elega' 'eleg' 'elem' 'elijim' 'elijih' 'elijoj' 'eloga' 'elome' 'elomu' 'elima' 'eliji' 'elije' 'elija' 'eliju' 'elim' 'elih' 'eloj' 'elom' 'elog' 'eli' 'ela' 'elu' 'ele' 'elo' (<-'el') 'anjijemu' 'anjijima' 'anjijega' 'anjijeg' 'anjijem' 'anjemu' 'anjega' 'anjeg' 'anjem' 'anjijim' 'anjijih' 'anjijoj' 'anjoga' 'anjome' 'anjomu' 'anjima' 'anjiji' 'anjije' 'anjija' 'anjiju' 'anjim' 'anjih' 'anjoj' 'anjom' 'anjog' 'anja' 'anje' 'anji' 'anjo' 'anju' (<-'anj') 
'enjijemu' 'enjijima' 'enjijega' 'enjijeg' 'enjijem' 'enjemu' 'enjega' 'enjeg' 'enjem' 'enjijim' 'enjijih' 'enjijoj' 'enjoga' 'enjome' 'enjomu' 'enjima' 'enjiji' 'enjije' 'enjija' 'enjiju' 'enjim' 'enjih' 'enjoj' 'enjom' 'enjog' 'enja' 'enje' 'enji' 'enjo' 'enju' (<-'enj') '{sv}njijemu' '{sv}njijima' '{sv}njijega' '{sv}njijeg' '{sv}njijem' '{sv}njemu' '{sv}njega' '{sv}njeg' '{sv}njem' '{sv}njijim' '{sv}njijih' '{sv}njijoj' '{sv}njoga' '{sv}njome' '{sv}njomu' '{sv}njima' '{sv}njiji' '{sv}njije' '{sv}njija' '{sv}njiju' '{sv}njim' '{sv}njih' '{sv}njoj' '{sv}njom' '{sv}njog' '{sv}nja' '{sv}nje' '{sv}nji' '{sv}njo' '{sv}nju' (<-'{sv}nj') 'anemu' 'anega' 'aneg' 'anem' (<-'an') 'enemu' 'enega' 'eneg' 'enem' (<-'en') '{sv}nemu' '{sv}nega' '{sv}neg' '{sv}nem' (<-'{sv}n') '{cv}inama' '{cv}inome' '{cv}inomu' '{cv}inoga' '{cv}inima' '{cv}inog' '{cv}inom' '{cv}inim' '{cv}inih' '{cv}inoj' '{cv}ina' '{cv}inu' '{cv}ini' '{cv}ino' '{cv}ine' (<-'{cv}in') 'ro{sv}iv{sv}i' 'ro{sv}ismo' 'ro{sv}iste' 'ro{sv}i{sv}e' 'ro{sv}imo' 'ro{sv}ite' 'ro{sv}iti' 'ro{sv}ili' 'ro{sv}ila' 'ro{sv}ilo' 'ro{sv}ile' 'ro{sv}im' 'ro{sv}i{sv}' 'ro{sv}it' 'ro{sv}ih' 'ro{sv}io' (<-'ro{sv}i') 'o{sv}ijemu' 'o{sv}ijima' 'o{sv}ijega' 'o{sv}ijeg' 'o{sv}ijem' 'o{sv}emu' 'o{sv}ega' 'o{sv}eg' 'o{sv}em' 'o{sv}ijim' 'o{sv}ijih' 'o{sv}ijoj' 'o{sv}oga' 'o{sv}ome' 'o{sv}omu' 'o{sv}ima' 'o{sv}iji' 'o{sv}ije' 'o{sv}ija' 'o{sv}iju' 'o{sv}im' 'o{sv}ih' 'o{sv}oj' 'o{sv}om' 'o{sv}og' 'o{sv}i' 'o{sv}a' 'o{sv}u' 'o{sv}e' (<-'o{sv}') 'evitijima' 'evitijega' 'evitijemu' 'evitijem' 'evitega' 'evitemu' 'evitem' 'evitijim' 'evitijih' 'evitijoj' 'evitijeg' 'evitiji' 'evitije' 'evitija' 'evitoga' 'evitome' 'evitomu' 'evitima' 'evitog' 'evitom' 'evitim' 'evitih' 'evitoj' 'eviti' 'evite' 'evito' 'evita' 'evitu' (<-'evit') 'ovitijima' 'ovitijega' 'ovitijemu' 'ovitijem' 'ovitega' 'ovitemu' 'ovitem' 'ovitijim' 'ovitijih' 'ovitijoj' 'ovitijeg' 'ovitiji' 'ovitije' 'ovitija' 'ovitoga' 'ovitome' 'ovitomu' 'ovitima' 'ovitog' 'ovitom' 'ovitim' 
'ovitih' 'ovitoj' 'oviti' 'ovite' 'ovito' 'ovita' 'ovitu' (<-'ovit') 'astijima' 'astijega' 'astijemu' 'astijem' 'astega' 'astemu' 'astem' 'astijim' 'astijih' 'astijoj' 'astijeg' 'astiji' 'astije' 'astija' 'astoga' 'astome' 'astomu' 'astima' 'astog' 'astom' 'astim' 'astih' 'astoj' 'asti' 'aste' 'asto' 'asta' 'astu' (<-'ast') 'kijemu' 'kijima' 'kijega' 'kijeg' 'kijem' 'kemu' 'kega' 'keg' 'kem' 'kijim' 'kijih' 'kijoj' 'koga' 'kome' 'komu' 'kima' 'kiji' 'kije' 'kija' 'kiju' 'kim' 'kih' 'koj' 'kom' 'kog' 'kov' 'ki' 'ka' 'ku' 'ke' 'ko' (<-'k') 'evaju{c'}i' 'evasmo' 'evaste' 'evajmo' 'evajte' 'evaju' 'evala' 'evale' 'evali' 'evalo' 'evamo' 'evana' 'evane' 'evani' 'evano' 'evate' 'evati' 'eva{sv}e' 'evahu' 'evah' 'evaj' 'evam' 'evan' 'evao' 'evat' 'evav' 'eva{sv}' (<-'eva') 'avaju{c'}i' 'avasmo' 'avaste' 'avajmo' 'avajte' 'avaju' 'avala' 'avale' 'avali' 'avalo' 'avamo' 'avana' 'avane' 'avani' 'avano' 'avate' 'avati' 'ava{sv}e' 'avahu' 'avah' 'avaj' 'avam' 'avan' 'avao' 'avat' 'avav' 'ava{sv}' (<-'ava') 'ivaju{c'}i' 'ivasmo' 'ivaste' 'ivajmo' 'ivajte' 'ivaju' 'ivala' 'ivale' 'ivali' 'ivalo' 'ivamo' 'ivana' 'ivane' 'ivani' 'ivano' 'ivate' 'ivati' 'iva{sv}e' 'ivahu' 'ivah' 'ivaj' 'ivam' 'ivan' 'ivao' 'ivat' 'ivav' 'iva{sv}' (<-'iva') 'uvaju{c'}i' 'uvasmo' 'uvaste' 'uvajmo' 'uvajte' 'uvaju' 'uvala' 'uvale' 'uvali' 'uvalo' 'uvamo' 'uvana' 'uvane' 'uvani' 'uvano' 'uvate' 'uvati' 'uva{sv}e' 'uvahu' 'uvah' 'uvaj' 'uvam' 'uvan' 'uvao' 'uvat' 'uvav' 'uva{sv}' (<-'uva') 'irujemo' 'irujete' 'iruju{c'}i' 'iraju{c'}i' 'irivat' 'irujem' 'iruje{sv}' 'irujmo' 'irujte' 'irav{sv}i' 'irasmo' 'iraste' 'irati' 'iramo' 'irate' 'iraju' 'ira{sv}e' 'irahu' 'irala' 'iralo' 'irali' 'irale' 'iruje' 'iruju' 'iruj' 'iral' 'iran' 'iram' 'ira{sv}' 'irat' 'irah' 'irao' (<-'ir') 'a{cv}ismo' 'a{cv}iste' 'a{cv}iti' 'a{cv}imo' 'a{cv}ite' 'a{cv}i{sv}e' 'a{cv}e{c'}i' 'a{cv}ila' 'a{cv}ilo' 'a{cv}ili' 'a{cv}ile' 'a{cv}ena' 'a{cv}eno' 'a{cv}eni' 'a{cv}ene' 'a{cv}io' 'a{cv}im' 'a{cv}i{sv}' 'a{cv}it' 'a{cv}ih' 
'a{cv}en' 'a{cv}i' 'a{cv}e' (<-'a{cv}') 'a{cv}av{sv}i' 'a{cv}asmo' 'a{cv}aste' 'a{cv}ahu' 'a{cv}ati' 'a{cv}amo' 'a{cv}ate' 'a{cv}a{sv}e' 'a{cv}ala' 'a{cv}alo' 'a{cv}ali' 'a{cv}ale' 'a{cv}aju' 'a{cv}ana' 'a{cv}ano' 'a{cv}ani' 'a{cv}ane' 'a{cv}ao' 'a{cv}am' 'a{cv}a{sv}' 'a{cv}at' 'a{cv}ah' 'a{cv}an' (<-'a{cv}a') 'nuv{sv}i' 'nusmo' 'nuste' 'nu{c'}i' 'nimo' 'nite' 'nemo' 'nete' 'nula' 'nulo' 'nule' 'nuli' 'nuto' 'nuti' 'nuta' 'ne{sv}' 'nuo' 'nut' (<-'n') 'niv{sv}i' 'nismo' 'niste' 'niti' 'nila' 'nilo' 'nile' 'nili' 'ni{sv}' 'nio' (<-'ni') 'aju{c'}i' 'av{sv}i' 'asmo' 'ajmo' 'ajte' 'ajem' 'aloj' 'amo' 'ate' 'aje' 'aju' 'ati' 'a{sv}e' 'ahu' 'ala' 'ali' 'ale' 'alo' 'ano' 'at' 'ah' 'ao' 'aj' 'an' 'am' 'a{sv}' (<-'a') 'uraju{c'}i' 'urasmo' 'uraste' 'urajmo' 'urajte' 'uramo' 'urate' 'uraju' 'urati' 'ura{sv}e' 'urahu' 'urala' 'urali' 'urale' 'uralo' 'urana' 'urano' 'urani' 'urane' 'ural' 'urat' 'urah' 'urao' 'uraj' 'uran' 'uram' 'ura{sv}' (<-'ur') 'astajasmo' 'astajaste' 'astajahu' 'astajati' 'astajemo' 'astajete' 'astaja{sv}e' 'astajali' 'astaju{c'}i' 'astajala' 'astajalo' 'astajale' 'astajmo' 'astajao' 'astajem' 'astaje{sv}' 'astajat' 'astajah' 'astajte' 'astaje' 'astaju' (<-'astaj') 'istajasmo' 'istajaste' 'istajahu' 'istajati' 'istajemo' 'istajete' 'istaja{sv}e' 'istajali' 'istaju{c'}i' 'istajala' 'istajalo' 'istajale' 'istajmo' 'istajao' 'istajem' 'istaje{sv}' 'istajat' 'istajah' 'istajte' 'istaje' 'istaju' (<-'istaj') 'ostajasmo' 'ostajaste' 'ostajahu' 'ostajati' 'ostajemo' 'ostajete' 'ostaja{sv}e' 'ostajali' 'ostaju{c'}i' 'ostajala' 'ostajalo' 'ostajale' 'ostajmo' 'ostajao' 'ostajem' 'ostaje{sv}' 'ostajat' 'ostajah' 'ostajte' 'ostaje' 'ostaju' (<-'ostaj') 'alama' 'alima' 'alom' 'alu' 'al' (<-'a') 'ajevima' 'ajevi' 'ajeva' 'ajeve' 'ajama' 'ajima' 'aja' 'aji' (<-'aj') 'astadosmo' 'astadoste' 'astado{sv}e' 'astanemo' 'astademo' 'astanete' 'astadete' 'astanimo' 'astanite' 'astanila' 'astav{sv}i' 'astanem' 'astadem' 'astane{sv}' 'astade{sv}' 'astadoh' 'astade' 'astati' 
'astane' 'astanu' 'astadu' 'astala' 'astali' 'astalo' 'astale' 'astat' 'astao' (<-'asta') 'istadosmo' 'istadoste' 'istado{sv}e' 'istanemo' 'istademo' 'istanete' 'istadete' 'istanimo' 'istanite' 'istanila' 'istav{sv}i' 'istanem' 'istadem' 'istane{sv}' 'istade{sv}' 'istadoh' 'istade' 'istati' 'istane' 'istanu' 'istadu' 'istala' 'istali' 'istalo' 'istale' 'istat' 'istao' (<-'ista') 'ostadosmo' 'ostadoste' 'ostado{sv}e' 'ostanemo' 'ostademo' 'ostanete' 'ostadete' 'ostanimo' 'ostanite' 'ostanila' 'ostav{sv}i' 'ostanem' 'ostadem' 'ostane{sv}' 'ostade{sv}' 'ostadoh' 'ostade' 'ostati' 'ostane' 'ostanu' 'ostadu' 'ostala' 'ostali' 'ostalo' 'ostale' 'ostat' 'ostao' (<-'osta') 'tasmo' 'taste' 'tajmo' 'tajte' 'tav{sv}i' 'tati' 'tamo' 'tate' 'taju' 'tala' 'talo' 'tale' 'tali' 'tana' 'tano' 'tani' 'tane' 'tan' 'taj' 'tao' 'tam' 'ta{sv}' 'tat' 'tah' (<-'ta') 'injasmo' 'injaste' 'injati' 'injemo' 'injete' 'injali' 'injala' 'injalo' 'injale' 'inja{sv}e' 'injahu' 'injem' 'inje{sv}' 'injat' 'injah' 'injao' (<-'inj') 'astemo' 'astete' 'astimo' 'astite' 'astu{c'}i' 'aste{sv}' 'asli' 'asla' 'aslo' 'asle' (<-'as') 'iv{sv}i' 'ie{c'}i' 'ismo' 'imo' 'ite' 'iti' 'ili' 'ila' 'ilo' 'ile' 'im' 'i{sv}' 'it' 'ih' 'io' (<-'i') 'ijemo' 'ijete' 'ijem' 'ije{sv}' 'ijmo' 'ijte' 'iju' 'ije' 'ij' 'ilu' (<-'i') 'lu{cv}ujete' 'lu{cv}uju{c'}i' 'lu{cv}ujemo' 'lu{cv}ujem' 'lu{cv}uje{sv}' 'lu{cv}ismo' 'lu{cv}iste' 'lu{cv}ujmo' 'lu{cv}ujte' 'lu{cv}uje' 'lu{cv}uju' 'lu{cv}i{sv}e' 'lu{cv}iti' 'lu{cv}imo' 'lu{cv}ite' 'lu{cv}ila' 'lu{cv}ilo' 'lu{cv}ili' 'lu{cv}ile' 'lu{cv}ena' 'lu{cv}eno' 'lu{cv}eni' 'lu{cv}ene' 'lu{cv}uj' 'lu{cv}io' 'lu{cv}en' 'lu{cv}im' 'lu{cv}i{sv}' 'lu{cv}it' 'lu{cv}ih' 'lu{cv}e' 'lu{cv}i' (<-'lu{cv}') 'jetismo' 'jetiste' 'jeti{sv}e' 'jetimo' 'jetite' 'jetiti' 'jetili' 'jetila' 'jetilo' 'jetile' 'jetim' 'jeti{sv}' 'jetit' 'jetih' 'jetio' (<-'jeti') 'emo' 'em' 'e{sv}' 'elama' 'el' (<-'e') 'ilama' 'ilima' 'ilom' 'il' (<-'i') 'atijega' 'atijemu' 'atijima' 'atijeg' 'atijem' 'atega' 'atemu' 'ateg' 
'atem' 'atijih' 'atijim' 'atima' 'atoga' 'atome' 'atomu' 'atiji' 'atije' 'atija' 'atiju' 'atoj' 'atog' 'atom' 'atim' 'atih' 'ata' 'atu' 'ato' (<-'at') 'etav{sv}i' 'etu{c'}i' 'etemo' 'etimo' 'etem' 'ete{sv}' (<-'et') 'lucujuci' 'lucujemo' 'lucujete' 'lucujem' 'lucujes' 'lucujmo' 'lucujte' 'lucismo' 'luciste' 'luciti' 'lucite' 'lucise' 'lucuje' 'lucuju' 'lucila' 'lucile' 'lucili' 'lucilo' 'lucena' 'luceni' 'lucene' 'luceno' 'lucimo' 'lucim' 'lucis' 'lucih' 'lucit' 'lucio' 'lucuj' 'lucen' 'luce' 'luci' (no_diacritics <-'luc') 'snjijima' 'snjijemu' 'snjijega' 'snjijim' 'snjijih' 'snjijeg' 'snjijoj' 'snjiji' 'snjija' 'snjije' 'snjiju' 'snjima' 'snjemu' 'snjomu' 'snjome' 'snjega' 'snjoga' 'snjih' 'snjim' 'snjem' 'snjom' 'snjeg' 'snjog' 'snjoj' 'snja' 'snje' 'snji' 'snjo' 'snju' (no_diacritics <-'snj') 'osijima' 'osijemu' 'osijega' 'snjijem' 'osijih' 'osijim' 'osijem' 'osijeg' 'osijoj' 'osima' 'osemu' 'osomu' 'osome' 'osega' 'osoga' 'osija' 'osije' 'osiji' 'osiju' 'osih' 'osim' 'osem' 'osom' 'oseg' 'osog' 'osoj' 'osa' 'ose' 'osi' 'osu' (no_diacritics <-'os') 'acismo' 'aciste' 'acima' 'acimo' 'acome' 'acomu' 'acite' 'aciti' 'acise' 'acila' 'acile' 'acili' 'acilo' 'acega' 'acene' 'aceci' 'aceni' 'acemu' 'acena' 'aceno' 'acoga' 'acoj' 'acih' 'acem' 'acom' 'acen' 'acog' 'acit' 'acio' 'aceg' 'acim' 'acuh' 'acis' 'ace' 'aca' 'aci' (no_diacritics <-'ac') 'ecome' 'ecoga' 'ecemu' 'ecima' 'ecega' 'ecomu' 'ecoj' 'ecuh' 'ecom' 'ecog' 'eceg' 'ecih' 'ecem' 'ecim' 'eca' 'ece' (no_diacritics <-'ec') 'ucomu' 'ucome' 'ucima' 'ucoga' 'ucega' 'ucemu' 'ucih' 'ucog' 'uceg' 'ucom' 'ucem' 'ucim' 'ucuh' 'ucoj' 'uca' 'uce' (no_diacritics <-'uc') 'rosismo' 'rosivsi' 'rosiste' 'rositi' 'rosili' 'rosise' 'rosite' 'rosilo' 'rosimo' 'rosile' 'rosila' 'rosit' 'rosis' 'rosio' 'rosim' 'rosih' (no_diacritics <-'rosi') 'acavsi' 'acaste' 'acasmo' 'acaju' 'acane' 'acate' 'acali' 'acani' 'acati' 'acale' 'acahu' 'acase' 'acano' 'acamo' 'acalo' 'acana' 'acala' 'acam' 'acan' 'acao' 'acas' 'acat' 'acah' 
(no_diacritics <-'aca') 'jasima' 'jasama' 'jasem' 'jasom' 'jase' 'jasi' 'jasa' 'jasu' (no_diacritics <-'jas') 'tasima' 'tasama' 'tasem' 'tasom' 'tase' 'tasa' 'tasu' 'tasi' (no_diacritics <-'tas') 'gasima' 'gasama' 'gasem' 'gasom' 'gasi' 'gasu' 'gase' 'gasa' (no_diacritics <-'gas') 'nasama' 'nasima' 'nasem' 'nasom' 'nasu' 'nasi' 'nase' 'nasa' (no_diacritics <-'nas') 'kasama' 'kasima' 'kasom' 'kasem' 'kasi' 'kasu' 'kase' 'kasa' (no_diacritics <-'kas') 'vasama' 'vasima' 'vasom' 'vasem' 'vasi' 'vase' 'vasa' 'vasu' (no_diacritics <-'vas') 'basama' 'basima' 'basom' 'basem' 'basi' 'base' 'basu' 'basa' (no_diacritics <-'bas') 'astuci' 'astes' (no_diacritics <-'as') 'cinima' 'cinome' 'cinama' 'cinomu' 'cinoga' 'cinom' 'cinih' 'cinim' 'cinog' 'cinoj' 'cino' 'cini' 'cinu' 'cine' 'cina' (no_diacritics <-'cin') 'astajase' 'astajuci' 'astajes' (no_diacritics <-'astaj') 'istajase' 'istajuci' 'istajes' (no_diacritics <-'istaj') 'ostajase' 'ostajuci' 'ostajes' (no_diacritics <-'ostaj') 'astadose' 'astades' 'astanes' 'astavsi' (no_diacritics <-'asta') 'istadose' 'istades' 'istanes' 'istavsi' (no_diacritics <-'ista') 'ostadose' 'ostades' 'ostanes' 'ostavsi' (no_diacritics <-'osta') 'avajuci' 'avase' 'avas' (no_diacritics <-'ava') 'evajuci' 'evase' 'evas' (no_diacritics <-'eva') 'ivajuci' 'ivase' 'ivas' (no_diacritics <-'iva') 'uvajuci' 'uvase' 'uvas' (no_diacritics <-'uva') 'ovase' (no_diacritics <-'ova') 'jetise' 'jetis' (no_diacritics <-'jeti') 'injase' 'injes' (no_diacritics <-'inj') 'istem' (no_diacritics <-'ist') 'esama' 'esem' 'esi' (no_diacritics <-'es') 'etavsi' 'etuci' 'etes' (no_diacritics <-'et') 'isama' 'isem' 'isi' (no_diacritics <-'is') 'irajuci' 'irujuci' 'irujes' 'iravsi' 'irase' 'iras' (no_diacritics <-'ir') 'urajuci' 'urase' 'uras' (no_diacritics <-'ur') 'ujuci' 'ujes' (no_diacritics <-'uj') 'nivsi' 'nis' (no_diacritics <-'ni') 'snega' 'snemu' 'snem' 'sneg' (no_diacritics <-'sn') 'tavsi' 'tas' (no_diacritics <-'ta') 'ajuci' 'avsi' 'ase' 'as' (no_diacritics <-'a') 
'ijes' 'ivsi' 'ieci' 'is' (no_diacritics <-'i') 'es' (no_diacritics <-'e') 'nuvsi' 'nuci' 'nes' (no_diacritics <-'n') ) ) define Step_3 as ( [substring] R1 among ( 'enom' 'enoj' 'enog' 'enim' 'enih' 'anoj' 'anog' 'anim' 'anih' 'ost' 'eno' 'eni' 'oga' 'ima' 'enu' 'ena' 'ama' 'ano' 'ani' 'om' 'og' 'u' 'o' 'i' 'e' 'a' (<-'') ) ) ) define stem as ( do cyr_to_lat do prelude do mark_regions backwards ( do Step_1 do (Step_2 or Step_3) ) ) snowball-3.1.0/algorithms/sesotho.sbl000066400000000000000000000057311520373054300177030ustar00rootroot00000000000000/* stringescapes UTF-8 */ /* Sesotho stemmer for the Snowball project ---------------------------------------- Author: Kamohelo Lebjane Purpose: To reduce Sesotho words to their morphological stems. Language notes: Sesotho (Southern Sotho) is an agglutinative Bantu language. Words often contain prefixes for noun classes and suffixes for tense, aspect, or derivation. An agglutinative language is a type of language that primarily forms words by stringing together morphemes (word parts)—each typically representing a single grammatical meaning—without significant modification to their forms (agglutinations). In such languages, affixes (prefixes, suffixes, infixes, or circumfixes) are added to a root word in a linear and systematic way, creating complex words that encode detailed grammatical information. Examples: baruti -> rut (root) moruti -> rut (root) rutile -> rut (root) The rules below remove common noun class prefixes and common verb suffixes, keeping the main root form. 
*/

/* --- Routine declarations --- */
routines (
    mark_regions
    remove_noun_prefixes
    remove_verb_suffixes
    remove_nominal_suffixes
)

/* --- External declarations --- */
// stem is the only routine exported to callers.
externals ( stem )

/* --- Groupings --- */
groupings ( v )

/* --- Character sets --- */
// v is the set of (unaccented, lower case) vowels.
define v 'aeiou'

/* --- Integer for tracking position --- */
// pV is set by mark_regions and used as the limit for suffix removal.
integers ( pV )

/* --- Mark vowel region --- */
define mark_regions as (
    // Set pV after the first vowel and at least 2 characters into the string.
    // Signals f if the string doesn't contain a vowel or is shorter than 2
    // characters.
    //
    // Both steps run inside test, so the cursor is left where it started.

    // Step 1: pV := position just past the first vowel (f if there is none).
    test (gopast v setmark pV)

    // Step 2: hop 2 signals f if fewer than 2 characters are available;
    // otherwise, if the position 2 characters in lies beyond pV, move pV
    // up to it.  The do keeps a false comparison from failing the routine.
    test (hop 2 do ($(cursor > pV) setmark pV))
)

/* --- Remove noun class prefixes --- */
// Deletes one of the listed prefixes when it is found at the cursor.
// Signals f, with nothing deleted, when no prefix matches or when either of
// the two checks below fails.
define remove_noun_prefixes as (
    // The slice to delete is exactly the matched prefix.
    [substring] among(
        'mo' 'ba' 'me' 'le' 'ma' 'se' 'boi' 'li'
    )
    /* Require at least two characters remain */
    test (next not atlimit)
    /* Only delete if there's a vowel after the cursor position */
    gopast v
    delete
)

// The routines in this section work backwards from the end of the word;
// stem invokes them from inside backwards ( ... ).
backwardmode (

    /* --- Remove verb suffixes (from end of word) --- */
    // Deletes the longest listed suffix found at the end of the word.
    // setlimit tomark pV confines matching to the part of the word after
    // pV, so a suffix that would reach back past pV is not removed.
    define remove_verb_suffixes as (
        setlimit tomark pV for (
            [substring] among(
                'ile'   /* perfect tense */
                'isa'   /* causative */
                'etse'  /* applicative */
                'ela'   /* applicative */
                'ang'   /* plural imperative */
                'ong'   /* continuous/derived form */
                'eng'
                'wa'    /* passive */
                'a'     /* infinitive marker */
                    (delete)
            )
        )
    )

    /* --- Remove nominal suffixes --- */
    // Same scheme as remove_verb_suffixes, with the nominal suffix list.
    define remove_nominal_suffixes as (
        setlimit tomark pV for (
            [substring] among(
                'nyana' /* diminutive form */
                'ana'   /* diminutive form */
                'ano'
                'oa'
                'i'
                    (delete)
            )
        )
    )
)

/* --- MAIN STEMMER --- */
define stem as (
    mark_regions // Signals f if the string is too short to stem.
backwards ( do remove_nominal_suffixes do remove_verb_suffixes ) do remove_noun_prefixes ) snowball-3.1.0/algorithms/spanish.sbl000066400000000000000000000134071520373054300176630ustar00rootroot00000000000000routines ( postlude mark_regions RV R1 R2 attached_pronoun standard_suffix y_verb_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef e' '{U+00E9}' // e-acute stringdef i' '{U+00ED}' // i-acute stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute stringdef u" '{U+00FC}' // u-diaeresis stringdef n~ '{U+00F1}' // n-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( '{a'}' (<- 'a') '{e'}' (<- 'e') '{i'}' (<- 'i') '{o'}' (<- 'o') '{u'}' (<- 'u') // and possibly {u"}->u here, or in prelude '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' 'las' 'les' 'los' 'nos' ) substring RV among( 'i{e'}ndo' (] <- 'iendo') '{a'}ndo' (] <- 'ando') '{a'}r' (] <- 'ar') '{e'}r' (] <- 'er') '{i'}r' (] <- 'ir') 'ando' 'iendo' 'ar' 'er' 'ir' (delete) 'yendo' ('u' delete) ) ) define standard_suffix as ( [substring] among( 'anza' 'anzas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' 'able' 'ables' 'ible' 'ibles' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amiento' 'amientos' 'imiento' 'imientos' ( R2 delete ) 'adora' 'ador' 'aci{o'}n' 'adoras' 'adores' 'aciones' 'ante' 'antes' 'ancia' 'ancias' 'acion' // Misspelling of '-ación'. 
( R2 delete try ( ['ic'] R2 delete ) ) 'log{i'}a' 'log{i'}as' ( R2 <- 'log' ) 'uci{o'}n' 'uciones' 'ucion' // Misspelling of '-ución'. ( R2 <- 'u' ) 'encia' 'encias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' 'able' 'ible' (R2 delete) ) ) ) 'idad' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) ) ) define y_verb_suffix as ( setlimit tomark pV for ([substring]) among( 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' 'yas' 'yes' 'yais' 'yamos' ('u' delete) ) ) define verb_suffix as ( setlimit tomark pV for ([substring]) among( 'en' 'es' '{e'}is' 'emos' (try ('u' test 'g') ] delete) 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' 'ar{e'}' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) 'e' '{e'}' ( RV delete try( ['u'] test 'g' RV delete ) ) ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or y_verb_suffix or verb_suffix ) do residual_suffix ) do postlude ) 
snowball-3.1.0/algorithms/swedish.sbl000066400000000000000000000057731520373054300176730ustar00rootroot00000000000000routines ( et_condition mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ost_ending ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef ao '{U+00E5}' stringdef o" '{U+00F6}' define v 'aeiouy{a"}{ao}{o"}' define s_ending 'bcdfghjklmnoprtvy' define ost_ending 'iklnprtuv' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) gopast v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define et_condition as ( (non-v v not atlimit) and not among ( // frihet, nyhet, råhet, trohet 'h' // societet 'iet' // annuitet, kontinuitet 'uit' // alfabet 'fab' // autenticitet, elektricitet, kapacitet, metallicitet, publicitet 'cit' // graviditet, likviditet, rigiditet 'dit' // neutralitet, rivalitet, sexualitet 'alit' // flexibilitet, instabilitet, kompatibilitet, mobilitet, variabilitet 'ilit' // anonymitet, intimitet, legitimitet 'mit' // kommunitet, maskulinitet, modernitet, spontanitet, suveränitet 'nit' // epitet, serendipitet 'pit' // auktoritet, integritet, majoritet, popularitet, prioritet 'rit' // densitet, generositet, intensitet, luminositet, viskositet 'sit' // identitet, kvantitet 'tit' // aggressivitet, positivitet 'ivit' // antikvitet, oblikvitet 'kvit' // komplexitet 'xit' // komet 'kom' // raket 'rak' // paket 'pak' // staket 'stak' ) ) define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' 'hetens' 'erns' 'at' 'andet' 'het' 'ast' (delete) 's' ( ('et' et_condition ]) or s_ending delete ) 'et' ( et_condition delete ) ) ) define consonant_pair as setlimit tomark p1 for ( among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') and ([next] delete) ) 
define other_suffix as ( setlimit tomark p1 for ([substring]) among( 'lig' 'ig' 'els' (delete) '{o"}st' (ost_ending <-'{o"}s') 'fullt' (<-'full') ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) snowball-3.1.0/algorithms/tamil.sbl000066400000000000000000000247631520373054300173330ustar00rootroot00000000000000/* * Affix stripping stemming algorithm for Tamil * By Damodharan Rajalingam */ stringescapes {} /* Aytham */ stringdef aytham '{U+0B83}' /* Uyir - independent vowels */ stringdef a '{U+0B85}' stringdef aa '{U+0B86}' stringdef i '{U+0B87}' stringdef ii '{U+0B88}' stringdef u '{U+0B89}' stringdef uu '{U+0B8A}' stringdef e '{U+0B8E}' stringdef ee '{U+0B8F}' stringdef ai '{U+0B90}' stringdef o '{U+0B92}' stringdef oo '{U+0B93}' stringdef au '{U+0B94}' /* Consonants */ stringdef ka '{U+0B95}' stringdef nga '{U+0B99}' stringdef ca '{U+0B9A}' stringdef ja '{U+0B9C}' stringdef nya '{U+0B9E}' stringdef tta '{U+0B9F}' stringdef nna '{U+0BA3}' stringdef ta '{U+0BA4}' stringdef tha '{U+0BA4}' stringdef na '{U+0BA8}' stringdef nnna '{U+0BA9}' stringdef pa '{U+0BAA}' stringdef ma '{U+0BAE}' stringdef ya '{U+0BAF}' stringdef ra '{U+0BB0}' stringdef rra '{U+0BB1}' stringdef la '{U+0BB2}' stringdef lla '{U+0BB3}' stringdef llla '{U+0BB4}' stringdef zha '{U+0BB4}' stringdef va '{U+0BB5}' /* Vatamozi - borrowed */ stringdef sha '{U+0BB6}' stringdef ssa '{U+0BB7}' stringdef sa '{U+0BB8}' stringdef ha '{U+0BB9}' /* Dependent vowel signs (kombu etc.) 
*/
stringdef vs_aa    '{U+0BBE}'
stringdef vs_i     '{U+0BBF}'
stringdef vs_ii    '{U+0BC0}'
stringdef vs_u     '{U+0BC1}'
stringdef vs_uu    '{U+0BC2}'
stringdef vs_e     '{U+0BC6}'
stringdef vs_ee    '{U+0BC7}'
stringdef vs_ai    '{U+0BC8}'
stringdef vs_o     '{U+0BCA}'
stringdef vs_oo    '{U+0BCB}'
stringdef vs_au    '{U+0BCC}'

/* Pulli */
stringdef pulli    '{U+0BCD}'

/* AU length mark */
stringdef au_lmark '{U+0BD7}'

/* Routines defined in this file */
routines (
    remove_plural_suffix
    remove_question_suffixes
    remove_question_prefixes
    remove_pronoun_prefixes
    remove_command_suffixes
    remove_um
    remove_vetrumai_urupukal
    fix_va_start
    fix_ending
    fix_endings
    remove_tense_suffix
    remove_tense_suffixes
    remove_common_word_endings
    has_min_length
)

/* stem is the only routine exported to callers */
externals ( stem )

/*
 * found_a_match        - set inside remove_tense_suffix when a tense suffix
 *                        is matched; it is that routine's final signal.
 * found_vetrumai_urupu - set by remove_vetrumai_urupukal and tested by
 *                        fix_ending.
 */
booleans (
    found_a_match
    found_vetrumai_urupu
)

// Length guard used by most of the removal routines: signals t only when
// the current word has more than 4 characters.  Does not move the cursor.
define has_min_length as (
    $(len > 4)
)

// Replaces the consonant va carrying one of the vowel signs oo, o, u or uu,
// matched at the cursor, with the corresponding independent vowel.
// Signals f when none of the four sequences matches.
define fix_va_start as (
    [substring] among (
        '{va}{vs_oo}' ( <- '{oo}' )
        '{va}{vs_o}'  ( <- '{o}' )
        '{va}{vs_u}'  ( <- '{u}' )
        '{va}{vs_uu}' ( <- '{uu}' )
    )
)

// Applies fix_ending repeatedly until it signals f.  Wrapped in do, so
// fix_endings itself always signals t.
define fix_endings as (
    do repeat fix_ending
)

// Deletes the sequence {e} + one of the listed consonants + pulli when it
// is found at the cursor, then attempts fix_va_start (inside do, so a
// failure there is ignored).  Signals f, removing nothing, when the
// sequence is not present.
define remove_question_prefixes as (
    [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
    do fix_va_start
)

// Gives signal t if an ending was fixed, signal f otherwise.
define fix_ending as ( $(len > 3) backwards ( ( [substring] among ( '{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}' ( delete ) '{ya}{pulli}' ( test among('{vs_ai}' '{vs_i}' '{vs_ii}') delete ) '{tta}{pulli}{pa}{pulli}' '{tta}{pulli}{ka}{pulli}' ( <- '{lla}{pulli}' ) '{nnna}{pulli}{rra}{pulli}' ( <- '{la}{pulli}' ) '{rra}{pulli}{ka}{pulli}' // '{nnna}{pulli}{nnna}{pulli}' ( <- '{la}{pulli}' ) '{tta}{pulli}{tta}{pulli}' ( <- '{tta}{vs_u}' ) '{ta}{pulli}{ta}{pulli}' ( found_vetrumai_urupu not '{vs_ai}' <- '{ma}{pulli}' ) '{vs_u}{ka}{pulli}' '{vs_u}{ka}{pulli}{ka}{pulli}' ( <- '{pulli}' ) '{va}' '{ya}' '{va}{pulli}' ( delete ) '{nnna}{vs_u}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') delete ) '{nga}{pulli}' ( among ( '{vs_ai}' ( delete ) '{pulli}' ( delete ) '' ( <- '{ma}{pulli}' ) ) ) ) ) or ( [ '{pulli}' ( ( among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') try ( '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ) ] delete ) or ( among( '{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}') ] '{pulli}' delete ) or ( test among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}' '{pulli}') ] delete ) ) ) ) ) define remove_pronoun_prefixes as ( [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete do fix_va_start ) define remove_plural_suffix as ( backwards ( [substring] among ( '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' ( ( among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') <- '{vs_u}{nga}{pulli}' ) or <- '{pulli}' ) '{rra}{pulli}{ka}{lla}{pulli}' ( <- '{la}{pulli}' ) '{tta}{pulli}{ka}{lla}{pulli}' ( <- '{lla}{pulli}' ) '{ka}{lla}{pulli}' ( delete ) ) ) ) define remove_question_suffixes as ( has_min_length backwards ( do ( [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}' ) ) do fix_endings ) define remove_command_suffixes as ( has_min_length backwards ( [ among('{pa}{vs_i}' 
'{va}{vs_i}') ] delete ) ) define remove_um as ( has_min_length backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}' ) do fix_ending ) define remove_common_word_endings as ( // These are not suffixes actually but are // some words that are attached to other words // but can be removed for stemming has_min_length backwards ( [substring] among ( '{vs_u}{tta}{nnna}{pulli}' '{vs_i}{la}{pulli}{la}{vs_ai}' '{vs_i}{tta}{ma}{pulli}' '{vs_i}{nnna}{pulli}{rra}{vs_i}' '{vs_aa}{ka}{vs_i}' '{vs_aa}{ka}{vs_i}{ya}' '{vs_e}{nnna}{pulli}{rra}{vs_u}' '{vs_u}{lla}{pulli}{lla}' '{vs_u}{tta}{vs_ai}{ya}' '{vs_u}{tta}{vs_ai}' '{vs_e}{nnna}{vs_u}{ma}{pulli}' '{vs_e}{nnna}' ( <- '{pulli}' ) '{la}{pulli}{la}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') <- '{pulli}' ) '{pa}{tta}{vs_u}' '{pa}{tta}{pulli}{tta}' '{pa}{tta}{pulli}{tta}{vs_u}' '{pa}{tta}{pulli}{tta}{ta}{vs_u}' '{pa}{tta}{pulli}{tta}{nna}' '{ka}{vs_u}{ra}{vs_i}{ya}' '{pa}{rra}{pulli}{rra}{vs_i}' '{va}{vs_i}{tta}{vs_u}' '{va}{vs_i}{tta}{pulli}{tta}{vs_u}' '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}' '{pa}{tta}{vs_i}' '{ta}{vs_aa}{nnna}' '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}' ( delete ) ) ) do fix_endings ) define remove_vetrumai_urupukal as ( unset found_vetrumai_urupu has_min_length backwards ( ( test ( [substring] among ( '{nnna}{vs_ai}' ( delete ) '{vs_o}{tta}{vs_u}' '{vs_oo}{tta}{vs_u}' '{vs_i}{la}{pulli}' '{vs_i}{rra}{pulli}' '{vs_i}{nnna}{pulli}{rra}{vs_u}' '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' '{va}{vs_i}{tta}' '{vs_aa}{la}{pulli}' '{vs_u}{tta}{vs_ai}' '{vs_aa}{ma}{la}{pulli}' '{vs_u}{lla}{pulli}' ( <- '{pulli}' ) '{vs_i}{nnna}{pulli}' ( not '{ma}' <- '{pulli}' ) '{vs_i}{tta}{ma}{pulli}' ( $(len >= 7) <- '{pulli}' ) '{la}{pulli}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') <- '{pulli}' ) '{ka}{nna}{pulli}' '{ma}{vs_u}{nnna}{pulli}' '{ma}{vs_ee}{la}{pulli}' '{ma}{vs_ee}{rra}{pulli}' '{ka}{vs_ii}{llla}{pulli}' (delete) '{ta}{vs_u}' ( not 
among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') delete ) '{vs_ii}' ( <- '{vs_i}' ) ) ) or test ( [ '{vs_ai}' ( (not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) or (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')) ) ] <- '{pulli}' ) ) (set found_vetrumai_urupu) do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) ) do fix_endings ) define remove_tense_suffixes as ( repeat remove_tense_suffix ) // Gives signal t if a tense suffix was removed, signal f otherwise. define remove_tense_suffix as ( unset found_a_match has_min_length backwards ( do ( test ( [substring] among ( '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}' '{pa}{tta}{vs_u}' '{ma}{vs_aa}{ra}{pulli}' '{ma}{vs_i}{nnna}{pulli}' '{nnna}{nnna}{pulli}' '{nnna}{vs_aa}{nnna}{pulli}' '{nnna}{vs_aa}{lla}{pulli}' '{nnna}{vs_aa}{ra}{pulli}' '{nnna}{lla}{pulli}' '{va}{lla}{pulli}' '{nnna}{ra}{pulli}' '{va}{ra}{pulli}' '{nnna}' '{pa}' '{ka}' '{ta}' '{ya}' '{pa}{nnna}{pulli}' '{pa}{lla}{pulli}' '{pa}{ra}{pulli}' '{vs_i}{rra}{pulli}{rra}{vs_u}' '{pa}{ma}{pulli}' '{nnna}{ma}{pulli}' '{ta}{vs_u}{ma}{pulli}' '{rra}{vs_u}{ma}{pulli}' '{ka}{vs_u}{ma}{pulli}' '{nnna}{vs_e}{nnna}{pulli}' '{nnna}{vs_ai}' '{va}{vs_ai}' ( delete ) '{va}{nnna}{pulli}' ( not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}') delete ) '{ta}{vs_u}' ( not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') delete ) '{vs_aa}{nnna}{pulli}' ( not '{ca}' <- '{pulli}' ) '{vs_aa}{lla}{pulli}' '{vs_aa}{ra}{pulli}' '{vs_ee}{nnna}{pulli}' '{vs_aa}' '{vs_aa}{ma}{pulli}' '{vs_e}{ma}{pulli}' '{vs_ee}{ma}{pulli}' '{vs_oo}{ma}{pulli}' '{tta}{vs_u}{ma}{pulli}' '{vs_aa}{ya}{pulli}' '{nnna}{vs_i}{ra}{pulli}' '{vs_ii}{ra}{pulli}' '{vs_ii}{ya}{ra}{pulli}' ( <- '{pulli}' ) '{ka}{vs_u}' ( test '{pulli}' delete ) ) (set found_a_match) ) ) do ([among( '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}' '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}' 
'{ka}{vs_i}{nnna}{pulli}{rra}' '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}' '{ka}{vs_i}{rra}' '{ka}{vs_i}{rra}{pulli}' )] delete (set found_a_match) ) ) do fix_endings found_a_match ) define stem as ( unset found_vetrumai_urupu do fix_ending has_min_length do remove_question_prefixes do remove_pronoun_prefixes do remove_question_suffixes do remove_um do remove_common_word_endings do remove_vetrumai_urupukal do remove_plural_suffix do remove_command_suffixes do remove_tense_suffixes ) snowball-3.1.0/algorithms/turkish.sbl000066400000000000000000000334201520373054300177040ustar00rootroot00000000000000/* Stemmer for Turkish * author: Evren (Kapusuz) Çilden * email: evren.kapusuz at gmail.com * * stems nominal verb suffixes * stems nominal inflections * more than one syllable word check * (y,n,s,U) context check * vowel harmony check * last consonant check and conversion (b, c, d, ğ to p, ç, t, k) * * The stemming algorithm is based on the paper "An Affix Stripping * Morphological Analyzer for Turkish" by Gülşen Eryiğit and * Eşref Adalı (Proceedings of the IAESTED International Conference * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, * Innsbruck, Austria * * Turkish is an agglutinative language and has a very rich morphological * structure. In Turkish, you can form many different words from a single stem * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means * "You had been the doctor of him". The stem of the word is "doktor" and it * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about * the append order of suffixes can be clearly described as FSMs. * The paper referenced above defines some FSMs for right to left * morphological analysis. I generated a method for constructing snowball * expressions from right to left FSMs for stemming suffixes. 
*/ routines ( append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings check_vowel_harmony // tests vowel harmony for suffixes is_reserved_word // tests whether current string is a reserved word ('ad','soyad') mark_cAsInA // nominal verb suffix mark_DA // noun suffix mark_DAn // noun suffix mark_DUr // nominal verb suffix mark_ki // noun suffix mark_lAr // noun suffix, nominal verb suffix mark_lArI // noun suffix mark_nA // noun suffix mark_ncA // noun suffix mark_ndA // noun suffix mark_ndAn // noun suffix mark_nU // noun suffix mark_nUn // noun suffix mark_nUz // nominal verb suffix mark_sU // noun suffix mark_sUn // nominal verb suffix mark_sUnUz // nominal verb suffix mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, mark_yA // noun suffix mark_ylA // noun suffix mark_yU // noun suffix mark_yUm // nominal verb suffix mark_yUz // nominal verb suffix mark_yDU // nominal verb suffix mark_yken // nominal verb suffix mark_ymUs_ // nominal verb suffix mark_ysA // nominal verb suffix mark_suffix_with_optional_y_consonant mark_suffix_with_optional_U_vowel mark_suffix_with_optional_n_consonant mark_suffix_with_optional_s_consonant more_than_one_syllable_word post_process_last_consonants postlude remove_proper_noun_suffix stem_nominal_verb_suffixes stem_noun_suffixes stem_suffix_chain_before_ki ) stringescapes { } /* Special characters in Unicode Latin-1 and Latin Extended-A */ stringdef cc '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE stringdef i '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS stringdef sc '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS booleans ( continue_stemming_noun_suffixes ) groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6 ) define vowel 'ae{i}io{o"}u{u"}' define U '{i}iu{u"}' // the vowel grouping definitions below are used for 
checking vowel harmony define vowel1 'a{i}ou' // vowels that can end with suffixes containing 'a' define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' define vowel3 'a{i}' // vowels that can end with suffixes containing '{i}' define vowel4 'ei' // vowels that can end with suffixes containing 'i' define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing '{o"}' or '{u"}' externals ( stem ) backwardmode ( // checks vowel harmony for possible suffixes, // helps to detect whether the candidate for suffix applies to vowel harmony // this rule is added to prevent over stemming define check_vowel_harmony as ( test ( (goto vowel) // if there is a vowel ( ('a' goto vowel1) or ('e' goto vowel2) or ('{i}' goto vowel3) or ('i' goto vowel4) or ('o' goto vowel5) or ('{o"}' goto vowel6) or ('u' goto vowel5) or ('{u"}' goto vowel6) ) ) ) // if the last consonant before suffix is vowel and n then advance and delete // if the last consonant before suffix is non vowel and n do nothing // if the last consonant before suffix is not n then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_n_consonant as ( ('n' (test vowel)) or ((not 'n') test(next vowel)) ) // if the last consonant before suffix is vowel and s then advance and delete // if the last consonant before suffix is non vowel and s do nothing // if the last consonant before suffix is not s then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_s_consonant as ( ('s' (test vowel)) or ((not 's') test(next vowel)) ) // if the last consonant before suffix is vowel and y then advance and delete // if the last consonant before suffix is non vowel and y do nothing // if the last consonant before suffix is not y then only delete the suffix // assumption: slice beginning is set correctly define 
mark_suffix_with_optional_y_consonant as ( ('y' (test vowel)) or ((not 'y') test(next vowel)) ) define mark_suffix_with_optional_U_vowel as ( (U (test non-vowel)) or ((not U) test(next non-vowel)) ) define mark_possessives as ( among ('m{i}z' 'miz' 'muz' 'm{u"}z' 'n{i}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') (mark_suffix_with_optional_U_vowel) ) define mark_sU as ( check_vowel_harmony U (mark_suffix_with_optional_s_consonant) ) define mark_lArI as ( among ('leri' 'lar{i}') ) define mark_yU as ( check_vowel_harmony U (mark_suffix_with_optional_y_consonant) ) define mark_nU as ( check_vowel_harmony among ('n{i}' 'ni' 'nu' 'n{u"}') ) define mark_nUn as ( check_vowel_harmony among ('{i}n' 'in' 'un' '{u"}n') (mark_suffix_with_optional_n_consonant) ) define mark_yA as ( check_vowel_harmony among('a' 'e') (mark_suffix_with_optional_y_consonant) ) define mark_nA as ( check_vowel_harmony among('na' 'ne') ) define mark_DA as ( check_vowel_harmony among('da' 'de' 'ta' 'te') ) define mark_ndA as ( check_vowel_harmony among('nda' 'nde') ) define mark_DAn as ( check_vowel_harmony among('dan' 'den' 'tan' 'ten') ) define mark_ndAn as ( check_vowel_harmony among('ndan' 'nden') ) define mark_ylA as ( check_vowel_harmony among('la' 'le') (mark_suffix_with_optional_y_consonant) ) define mark_ki as ( 'ki' ) define mark_ncA as ( check_vowel_harmony among('ca' 'ce') (mark_suffix_with_optional_n_consonant) ) define mark_yUm as ( check_vowel_harmony among ('{i}m' 'im' 'um' '{u"}m') (mark_suffix_with_optional_y_consonant) ) define mark_sUn as ( check_vowel_harmony among ('s{i}n' 'sin' 'sun' 's{u"}n' ) ) define mark_yUz as ( check_vowel_harmony among ('{i}z' 'iz' 'uz' '{u"}z') (mark_suffix_with_optional_y_consonant) ) define mark_sUnUz as ( among ('s{i}n{i}z' 'siniz' 'sunuz' 's{u"}n{u"}z') ) define mark_lAr as ( check_vowel_harmony among ('ler' 'lar') ) define mark_nUz as ( check_vowel_harmony among ('n{i}z' 'niz' 'nuz' 'n{u"}z') ) define mark_DUr as ( check_vowel_harmony among ('t{i}r' 'tir' 'tur' 
't{u"}r' 'd{i}r' 'dir' 'dur' 'd{u"}r') ) define mark_cAsInA as ( among ('cas{i}na' 'cesine') ) define mark_yDU as ( check_vowel_harmony among ('t{i}m' 'tim' 'tum' 't{u"}m' 'd{i}m' 'dim' 'dum' 'd{u"}m' 't{i}n' 'tin' 'tun' 't{u"}n' 'd{i}n' 'din' 'dun' 'd{u"}n' 't{i}k' 'tik' 'tuk' 't{u"}k' 'd{i}k' 'dik' 'duk' 'd{u"}k' 't{i}' 'ti' 'tu' 't{u"}' 'd{i}' 'di' 'du' 'd{u"}') (mark_suffix_with_optional_y_consonant) ) // does not fully obey vowel harmony define mark_ysA as ( among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') (mark_suffix_with_optional_y_consonant) ) define mark_ymUs_ as ( check_vowel_harmony among ('m{i}{sc}' 'mi{sc}' 'mu{sc}' 'm{u"}{sc}') (mark_suffix_with_optional_y_consonant) ) define mark_yken as ( 'ken' (mark_suffix_with_optional_y_consonant) ) define stem_nominal_verb_suffixes as ( [ set continue_stemming_noun_suffixes (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) or (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) or ( mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) unset continue_stemming_noun_suffixes ) or (mark_nUz (mark_yDU or mark_ysA)) or ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) or (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) ]delete ) // stems noun suffix chains ending with -ki define stem_suffix_chain_before_ki as ( [ mark_ki ( (mark_DA] delete try([ (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) )) or (mark_nUn] delete try([ (mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) or (mark_ndA ( (mark_lArI] delete) or ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) or (stem_suffix_chain_before_ki) )) ) ) define stem_noun_suffixes as ( ([mark_lAr] delete 
try(stem_suffix_chain_before_ki)) or ([mark_ncA] delete try( ([mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or ([mark_lAr] delete stem_suffix_chain_before_ki) ) ) or ([(mark_ndA or mark_nA) ( (mark_lArI] delete) or (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) ) ) or ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) or ( [mark_DAn] delete try ([ ( (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) ) or ([mark_nUn or mark_ylA] delete try( ([mark_lAr] delete stem_suffix_chain_before_ki) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or stem_suffix_chain_before_ki ) ) or ([mark_lArI] delete) or (stem_suffix_chain_before_ki) or ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) ) define post_process_last_consonants as ( [substring] among ( 'b' (<- 'p') 'c' (<- '{cc}') 'd' (<- 't') '{g~}' (<- 'k') ) ) // after stemming if the word ends with 'd' or 'g' most probably last U is // overstemmed like in 'kedim' -> 'ked' // Turkish words don't usually end with 'd' or 'g' // some very well known words are ignored (like 'ad' 'soyad' // appends U to stems ending with d or g, decides which vowel to add // based on the last vowel in the stem define append_U_to_stems_ending_with_d_or_g as ( [] ('d' or 'g') goto vowel (('a' or '{i}') <- '{i}') or (('e' or 'i') <- 'i') or (('o' or 'u') <- 'u') or (('{o"}' or '{u"}') <- '{u"}') ) define is_reserved_word as ( 'ad' try 'soy' atlimit ) ) define remove_proper_noun_suffix as ( // Remove any leading apostrophes (e.g. 
from tokenisation of single-quoted // text). do ([goto not '{'}'] delete) // https://en.wikipedia.org/wiki/Turkish_language says "In modern // Turkish orthography, an apostrophe is used to separate proper names // from any suffixes" with the example "Türkiye'dir ("it is Turkey")". // Therefore we truncate at the first apostrophe, provided there are at least // two characters before it (which avoids adversely affecting some foreign // names and words such as "o'connor", "l'entrée"). do ( hop 2 goto '{'}' [ tolimit ] delete ) ) // Test if there is more than one syllable. // In Turkish each vowel indicates a distinct syllable. define more_than_one_syllable_word as ( test (loop 2 gopast vowel) ) define postlude as ( backwards ( not is_reserved_word do append_U_to_stems_ending_with_d_or_g do post_process_last_consonants ) ) define stem as ( do remove_proper_noun_suffix more_than_one_syllable_word backwards ( do stem_nominal_verb_suffixes continue_stemming_noun_suffixes do stem_noun_suffixes ) postlude ) snowball-3.1.0/algorithms/yiddish.sbl000066400000000000000000000410421520373054300176470ustar00rootroot00000000000000/* ******************************************* * Stemmer for Yiddish language in YIVO script * * Author: Assaf Urieli * Emails: assaf.urieli at gmail.com ********************************************* */ routines ( prelude mark_regions R1 R1plus3 standard_suffix ) externals ( stem ) integers ( p1 x ) groupings ( vowel niked alefBeys consonant ) stringescapes {} // AlefBeys stringdef Alef '{U+05D0}' stringdef Beys '{U+05D1}' stringdef Giml '{U+05D2}' stringdef Dalet '{U+05D3}' stringdef Hey '{U+05D4}' stringdef Vov '{U+05D5}' stringdef Zayen '{U+05D6}' stringdef Khes '{U+05D7}' stringdef Tes '{U+05D8}' stringdef Yud '{U+05D9}' stringdef LangerKhof '{U+05DA}' stringdef Khof '{U+05DB}' stringdef Lamed '{U+05DC}' stringdef ShlosMem '{U+05DD}' stringdef Mem '{U+05DE}' stringdef LangerNun '{U+05DF}' stringdef Nun '{U+05E0}' stringdef Samekh '{U+05E1}' stringdef 
Ayen '{U+05E2}' stringdef LangerFey '{U+05E3}' stringdef Fey '{U+05E4}' stringdef LangerTsadek '{U+05E5}' stringdef Tsadek '{U+05E6}' stringdef Kuf '{U+05E7}' stringdef Reysh '{U+05E8}' stringdef Shin '{U+05E9}' stringdef Sof '{U+05EA}' stringdef TsveyVovn '{U+05F0}' stringdef VovYud '{U+05F1}' stringdef TsveyYudn '{U+05F2}' // Niked stringdef Shvo '{U+05B0}' stringdef Khirik '{U+05B4}' stringdef Tseyre '{U+05B5}' stringdef Segl '{U+05B6}' stringdef ReducedSegl '{U+05B1}' stringdef Pasekh '{U+05B7}' stringdef ReducedPasekh '{U+05B2}' stringdef Komets '{U+05B8}' stringdef ReducedKomets '{U+05B3}' stringdef Rafe '{U+05BF}' stringdef SinDot '{U+05C2}' stringdef ShinDot '{U+05C1}' stringdef Khoylm '{U+05B9}' stringdef Melupm '{U+05BC}' stringdef Kubuts '{U+05BB}' // Groupings define niked '{Shvo}{Khirik}{Tseyre}{Segl}{ReducedSegl}{Pasekh}{ReducedPasekh}{Komets}{ReducedKomets}{SinDot}{ShinDot}{Khoylm}{Melupm}{Kubuts}{Rafe}' define alefBeys '{Alef}{Beys}{Giml}{Dalet}{Hey}{Vov}{Zayen}{Khes}{Tes}{Yud}{LangerKhof}{Khof}{Lamed}{ShlosMem}{Mem}{LangerNun}{Nun}{Samekh}{Ayen}{LangerFey}{Fey}{LangerTsadek}{Tsadek}{Kuf}{Reysh}{Shin}{Sof}{TsveyVovn}{VovYud}{TsveyYudn}' define vowel '{Alef}{Vov}{Yud}{Ayen}{VovYud}{TsveyYudn}' define consonant alefBeys - vowel define prelude as ( do ( repeat goto ( [substring] among ( '{Vov}{Vov}' ( not '{Melupm}' <- '{TsveyVovn}' ) '{Vov}{Yud}' ( not '{Khirik}' <- '{VovYud}' ) '{Yud}{Yud}' ( not '{Khirik}' <- '{TsveyYudn}' ) '{LangerKhof}' ( <- '{Khof}') '{ShlosMem}' ( <- '{Mem}' ) '{LangerNun}' ( <- '{Nun}' ) '{LangerFey}' ( <- '{Fey}' ) '{LangerTsadek}' ( <- '{Tsadek}' ) ) ) ) do (repeat goto ( [niked] delete )) ) define mark_regions as ( $p1 = limit ( try ( // Replace past participle ge- at start of word // Unless word starts with gelt- or gebn- or the whole word is ge ['{Giml}{Ayen}'] not ('{Lamed}{Tes}' or '{Beys}{Nun}' or atlimit) <- 'GE' ) try ( // skip verbal prefix among( // Free stressed: Adurkh-, Durkh-, Ahin-, Aher-, Avek-, Mit-, 
Antkegn-, Akegn-, Anider-, Arop-, Aroys-, Aroyf-, Arum-, Arayn-, Arunter-, Ariber-, Nokh-, Farbay-, Aheym-, Afir-, Faroys-, Funander-, Tsuzamen-, Tsunoyf-, Tsurik- '{Alef}{Dalet}{Vov}{Reysh}{Khof}' '{Dalet}{Vov}{Reysh}{Khof}' '{Alef}{Hey}{Yud}{Nun}' '{Alef}{Hey}{Ayen}{Reysh}' '{Alef}{TsveyVovn}{Ayen}{Kuf}' '{Mem}{Yud}{Tes}' '{Alef}{Nun}{Tes}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Nun}{Yud}{Dalet}{Ayen}{Reysh}' '{Alef}{Reysh}{Alef}{Fey}' '{Alef}{Reysh}{VovYud}{Samekh}' '{Alef}{Reysh}{VovYud}{Fey}' '{Alef}{Reysh}{Vov}{Mem}' '{Alef}{Reysh}{TsveyYudn}{Nun}' '{Alef}{Reysh}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Reysh}{Yud}{Beys}{Ayen}{Reysh}' '{Nun}{Alef}{Khof}' '{Fey}{Alef}{Reysh}{Beys}{TsveyYudn}' '{Alef}{Hey}{TsveyYudn}{Mem}' '{Alef}{Fey}{Yud}{Reysh}' '{Fey}{Alef}{Reysh}{VovYud}{Samekh}' '{Fey}{Vov}{Nun}{Alef}{Nun}{Dalet}{Ayen}{Reysh}' '{Tsadek}{Vov}{Zayen}{Alef}{Mem}{Ayen}{Nun}' '{Tsadek}{Vov}{Nun}{VovYud}{Fey}' '{Tsadek}{Vov}{Reysh}{Yud}{Kuf}' // Stressed: Oys-, Oyf-, Um-, Unter-, Iber-, Ayn-, On-, Op-, Bay-, For-, Tsu-. '{Alef}{VovYud}{Samekh}' '{Alef}{VovYud}{Fey}' '{Alef}{Vov}{Mem}' '{Alef}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Yud}{Beys}{Ayen}{Reysh}' '{Alef}{TsveyYudn}{Nun}' '{Alef}{Nun}' '{Alef}{Fey}' '{Beys}{TsveyYudn}' '{Fey}{Alef}{Reysh}' '{Tsadek}{Vov}' // Unstressed: Ant-, Ba-, Der-, Tse-. Far- already covered by For-. Ge- comes later. '{Alef}{Nun}{Tes}' '{Beys}{Alef}' '{Dalet}{Ayen}{Reysh}' '{Tsadek}{Ayen}' // If verbal prefix followed by Tsu- or Ge-, replace it ( // Don't mark the TSU- prefix inside verbs like "oys-tsugn" test (('{Tsadek}{Vov}{Giml}{Nun}' or '{Tsadek}{Vov}{Kuf}{Tes}' or '{Tsadek}{Vov}{Kuf}{Nun}') atlimit) or // Don't mark the GE- prefix inside verbs like "avek-gebn" test ('{Giml}{Ayen}{Beys}{Nun}') or ( ['{Giml}{Ayen}'] <- 'GE') or (['{Tsadek}{Vov}'] <- 'TSU') ) ) ) test(hop 3 setmark x) // We want to allow three-consonant Hebrew roots. 
// To this end, we skip three-consonant combinations that exist in non-Hebraic Yiddish. try ( among( '{Shin}{Fey}{Reysh}' '{Shin}{Tes}{Reysh}' '{Shin}{Tes}{Shin}' '{Dalet}{Zayen}{Shin}' ( true ) ) ) // Either 3 consonants or the first non-vowel after a vowel ( not (consonant consonant consonant setmark p1) gopast vowel goto non-vowel setmark p1 ) try($p1 < x $p1 = x) // at least 3 past the prefix ) ) backwardmode ( define R1 as $p1 <= cursor // Like R1, but also allows the cursor to be outside R1 by the width of Giml Yud Samekh define R1plus3 as $p1 <= cursor + sizeof '{Giml}{Yud}{Samekh}' define standard_suffix as ( do ( [substring] among( // Plural/adjective endings: -er, -ers, -e, -n, -s, -en, -ns, -eners, -ens, -es '{Ayen}{Reysh}{Samekh}' '{Ayen}{Nun}' '{Nun}{Samekh}' '{Ayen}{Nun}{Ayen}{Reysh}{Samekh}' '{Ayen}{Samekh}' '{Ayen}' '{Nun}' '{Samekh}' '{Ayen}{Mem}' '{Ayen}{Reysh}' ( R1 delete ) // Exception: don't delete noun endings -ie, like "agitatsie" '{Yud}{Ayen}' ( true ) // -ies => ie '{Yud}{Ayen}{Samekh}' ( R1 <- '{Yud}{Ayen}' ) // Plural/adjective endings: -enem, -ener, -ene, -ens '{Ayen}{Nun}{Ayen}' '{Ayen}{Nun}{Ayen}{Mem}' '{Ayen}{Nun}{Ayen}{Reysh}' '{Ayen}{Nun}{Samekh}' (R1 delete [substring] among ( // -gegangen => -gey '{Giml}{Alef}{Nun}{Giml}' (<- '{Giml}{TsveyYudn}') // -genumen => -nem '{Nun}{Vov}{Mem}' (<- '{Nun}{Ayen}{Mem}') // -gemiten => -mayd '{Mem}{Yud}{Tes}' (<- '{Mem}{TsveyYudn}{Dalet}') // -gebiten => -bayt '{Beys}{Yud}{Tes}' (<- '{Beys}{TsveyYudn}{Tes}') // -gebisen => -bays '{Beys}{Yud}{Samekh}' (<- '{Beys}{TsveyYudn}{Samekh}') // -gevizen => -vayz '{TsveyVovn}{Yud}{Zayen}' (<- '{TsveyVovn}{TsveyYudn}{Zayen}') // -getriben => -trayb '{Tes}{Reysh}{Yud}{Beys}' (<- '{Tes}{Reysh}{TsveyYudn}{Beys}') // -geliten => -layt '{Lamed}{Yud}{Tes}' (<- '{Lamed}{TsveyYudn}{Tes}') // -gekliben => -klayb '{Kuf}{Lamed}{Yud}{Beys}' (<- '{Kuf}{Lamed}{TsveyYudn}{Beys}') // -geriben => -rayb '{Reysh}{Yud}{Beys}' (<- '{Reysh}{TsveyYudn}{Beys}') // -gerisen => 
-rays '{Reysh}{Yud}{Samekh}' (<- '{Reysh}{TsveyYudn}{Samekh}') // -geshvigen => -shvayg '{Shin}{TsveyVovn}{Yud}{Giml}' (<- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}') // -geshmisen => -shmays '{Shin}{Mem}{Yud}{Samekh}' (<- '{Shin}{Mem}{TsveyYudn}{Samekh}') // -geshniten => -shnayd '{Shin}{Nun}{Yud}{Tes}' (<- '{Shin}{Nun}{TsveyYudn}{Dalet}') // -geshriben => -shrayb '{Shin}{Reysh}{Yud}{Beys}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}') // -gebunden => -bind '{Beys}{Vov}{Nun}{Dalet}' (<- '{Beys}{Yud}{Nun}{Dalet}') // -gevuntshn => -vintsh '{TsveyVovn}{Vov}{Tes}{Shin}' (<- '{TsveyVovn}{Yud}{Tes}{Shin}') // -gezungen => -zing '{Zayen}{Vov}{Nun}{Giml}' (<- '{Zayen}{Yud}{Nun}{Giml}') // -getrunken => -trink '{Tes}{Reysh}{Vov}{Nun}{Kuf}' (<- '{Tes}{Reysh}{Yud}{Nun}{Kuf}') // -getsvungen => -tsving '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}' (<- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}') // -geshlungen => -shling '{Shin}{Lamed}{Vov}{Nun}{Giml}' (<- '{Shin}{Lamed}{Yud}{Nun}{Giml}') // -geboygen => -beyg '{Beys}{VovYud}{Giml}' (<- '{Beys}{TsveyYudn}{Giml}') // -gehoyben => -heyb '{Hey}{VovYud}{Beys}' (<- '{Hey}{TsveyYudn}{Beys}') // -farloyren => -farlir '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}' (<- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}') // -shtanen => -shtey '{Shin}{Tes}{Alef}{Nun}' (<- '{Shin}{Tes}{TsveyYudn}') // -geshvoyrn => -shver '{Shin}{TsveyVovn}{VovYud}{Reysh}' (<- '{Shin}{TsveyVovn}{Ayen}{Reysh}') ) ) // Verb/past participle ending: -t '{Tes}' ( R1 delete ) // As well as noun/adjectives ending in -tn, -te, -ter, -ts so that the "-t" doesn't differentiate // Similarly for past participles: -tns, -tene, -tenem, -tener // If the Tes was before R1, we try to perform the same action while leaving the Tes in place '{Tes}{Nun}' '{Tes}{Ayen}' '{Tes}{Ayen}{Reysh}' '{Tes}{Samekh}' '{Tes}{Nun}{Samekh}' '{Tes}{Ayen}{Nun}{Ayen}' '{Tes}{Ayen}{Nun}{Ayen}{Mem}' '{Tes}{Ayen}{Nun}{Ayen}{Reysh}' ( ((R1 delete) or ( <- '{Tes}')) // -(ge)brakht => -breng ['{Beys}{Reysh}{Alef}{Khof}' try '{Giml}{Ayen}'] 
<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' ) // Past participles: -et, -etn, -ets, -ete, -eter '{Ayen}{Tes}' '{Ayen}{Tes}{Nun}' '{Ayen}{Tes}{Samekh}' '{Ayen}{Tes}{Ayen}' '{Ayen}{Tes}{Ayen}{Reysh}' ( R1 delete ) // -geyn shorted to -gey '{Giml}{TsveyYudn}{Nun}' ( <- '{Giml}{TsveyYudn}') // ##################### Long list of irregular past participles // -(ge)gangen (shortened to -gangen after prefixes) => -gey '{Giml}{Alef}{Nun}{Giml}{Ayen}{Nun}' ( <- '{Giml}{TsveyYudn}' ) // -(ge)numen (shortened to -numen after prefixes) => -nem '{Nun}{Vov}{Mem}{Ayen}{Nun}' (<- '{Nun}{Ayen}{Mem}' ) // -(ge)shribn (shortened to -shribn after prefixes) => -shrayb '{Shin}{Reysh}{Yud}{Beys}{Nun}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}' ) // -gemiten => -mayd 'GE{Mem}{Yud}{Tes}{Nun}' (<- '{Mem}{TsveyYudn}{Dalet}') // -gebiten => -bayt 'GE{Beys}{Yud}{Tes}{Nun}' (<- '{Beys}{TsveyYudn}{Tes}') // -gebisen => -bays 'GE{Beys}{Yud}{Samekh}{Nun}' ( <- '{Beys}{TsveyYudn}{Samekh}') // -gevizen => -vayz '{TsveyVovn}{Yud}{Zayen}{Nun}' ( <- '{TsveyVovn}{TsveyYudn}{Zayen}') // -getriben => -trayb '{Tes}{Reysh}{Yud}{Beys}{Nun}' ( <- '{Tes}{Reysh}{TsveyYudn}{Beys}') // -geliten => -layt 'GE{Lamed}{Yud}{Tes}{Nun}' ( <- '{Lamed}{TsveyYudn}{Tes}') // -gekliben => -klayb '{Kuf}{Lamed}{Yud}{Beys}{Nun}' ( <- '{Kuf}{Lamed}{TsveyYudn}{Beys}') // -geriben => -rayb '{Reysh}{Yud}{Beys}{Nun}' ( <- '{Reysh}{TsveyYudn}{Beys}') // -gerisen => -rays 'GE{Reysh}{Yud}{Samekh}{Nun}' ( <- '{Reysh}{TsveyYudn}{Samekh}') // -geshvigen => -shvayg '{Shin}{TsveyVovn}{Yud}{Giml}{Nun}' ( <- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}') // -geshmisen => -shmays '{Shin}{Mem}{Yud}{Samekh}{Nun}' ( <- '{Shin}{Mem}{TsveyYudn}{Samekh}') // -geshniten => -shnayd '{Shin}{Nun}{Yud}{Tes}{Nun}' ( <- '{Shin}{Nun}{TsveyYudn}{Dalet}') // -gebunden => -bind '{Beys}{Vov}{Nun}{Dalet}{Nun}' ( <- '{Beys}{Yud}{Nun}{Dalet}') // -gevuntshn => -vintsh '{TsveyVovn}{Vov}{Tes}{Shin}{Nun}' ( <- '{TsveyVovn}{Yud}{Tes}{Shin}') // -gezungen => -zing 
'{Zayen}{Vov}{Nun}{Giml}{Nun}' ( <- '{Zayen}{Yud}{Nun}{Giml}') // -getrunken => -trink '{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}' ( <- '{Tes}{Reysh}{Yud}{Nun}{Kuf}') // -getsvungen => -tsving '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}' ( <- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}') // -geshlungen => -shling '{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}' ( <- '{Shin}{Lamed}{Yud}{Nun}{Giml}') // -geboygen => -beyg '{Beys}{VovYud}{Giml}{Nun}' ( <- '{Beys}{TsveyYudn}{Giml}') // -gehoyben => -heyb '{Hey}{VovYud}{Beys}{Nun}' ( <- '{Hey}{TsveyYudn}{Beys}') // -farloyren => -farlir '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}' ( <- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}') // -shtanen => -shtey '{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}' ( <- '{Shin}{Tes}{TsveyYudn}') // -geshvoyrn => -shver '{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}' ( <- '{Shin}{TsveyVovn}{Ayen}{Reysh}') // -(ge)brakht (shortened to -brakht after prefixes) => -breng '{Beys}{Reysh}{Alef}{Khof}{Tes}' (<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' ) // ###### End of irregular past participles // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}' ( R1 delete ) // Noun endings: -izm, izmen '{Yud}{Zayen}{Mem}' '{Yud}{Zayen}{Mem}{Ayen}{Nun}' ( R1 delete ) // Plural ending: -im '{Yud}{Mem}' ( R1 delete ) // Plural ending: -os (Hebraic), replace with -h '{Vov}{Sof}' ( R1 <- '{Hey}' ) // Diminutive endings: -elekh, -ele, -lekh, -eles, -elen '{Ayen}{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}' '{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}{Samekh}' '{Ayen}{Lamed}{Ayen}{Nun}' ( R1 delete ) // Noun ending: -ist '{Yud}{Samekh}{Tes}' ( // Exceptions: -gist, -shist ( ('{Giml}' or '{Shin}') try (R1plus3 <- '{Yud}{Samekh}') ) or ( R1 delete ) ) // Noun ending: -istn '{Yud}{Samekh}{Tes}{Nun}' ( R1 delete ) // Verb ending: -stu '{Samekh}{Tes}{Vov}' ( R1 delete ) // Superlative ending: -ster, -ste, -stn '{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' 
'{Samekh}{Tes}{Nun}' ( R1 delete ) // Ambiguous verb ending: -st '{Samekh}{Tes}' ( R1 delete ) ) ) do ( [substring] among( // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}' ( R1 delete ) // Diminutive endings: -l '{Lamed}' ( R1 consonant delete ) ) ) do ( [substring] among( // Adjective endings: -ig, -ik, -ish, -nik, -dik '{Yud}{Giml}' '{Yud}{Kuf}' '{Yud}{Shin}' '{Nun}{Yud}{Kuf}' '{Dalet}{Yud}{Kuf}' ( R1 delete ) // Exceptions to above: -blik, -glik '{Beys}{Lamed}{Yud}{Kuf}' '{Giml}{Lamed}{Yud}{Kuf}' ( true ) // Present participle endings: -ndik '{Nun}{Dalet}{Yud}{Kuf}' ( R1 delete ) // Present participle ending -endik: delete if after a -ng, -nk, -n, -m, consonant+l, or vowel. // Otherwise, delete just the -ndik part. '{Ayen}{Nun}{Dalet}{Yud}{Kuf}' ( R1 delete ) ) ) do (repeat goto ( ['GE' or 'TSU'] delete )) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix ) snowball-3.1.0/charsets/000077500000000000000000000000001520373054300151525ustar00rootroot00000000000000snowball-3.1.0/charsets/ISO-8859-2.sbl000066400000000000000000000051031520373054300170570ustar00rootroot00000000000000// ISO-8859-2 character mappings. 
stringdef U+00A0 hex 'A0' stringdef U+0104 hex 'A1' stringdef U+02D8 hex 'A2' stringdef U+0141 hex 'A3' stringdef U+00A4 hex 'A4' stringdef U+013D hex 'A5' stringdef U+015A hex 'A6' stringdef U+00A7 hex 'A7' stringdef U+00A8 hex 'A8' stringdef U+0160 hex 'A9' stringdef U+015E hex 'AA' stringdef U+0164 hex 'AB' stringdef U+0179 hex 'AC' stringdef U+00AD hex 'AD' stringdef U+017D hex 'AE' stringdef U+017B hex 'AF' stringdef U+00B0 hex 'B0' stringdef U+0105 hex 'B1' stringdef U+02DB hex 'B2' stringdef U+0142 hex 'B3' stringdef U+00B4 hex 'B4' stringdef U+013E hex 'B5' stringdef U+015B hex 'B6' stringdef U+02C7 hex 'B7' stringdef U+00B8 hex 'B8' stringdef U+0161 hex 'B9' stringdef U+015F hex 'BA' stringdef U+0165 hex 'BB' stringdef U+017A hex 'BC' stringdef U+02DD hex 'BD' stringdef U+017E hex 'BE' stringdef U+017C hex 'BF' stringdef U+0154 hex 'C0' stringdef U+00C1 hex 'C1' stringdef U+00C2 hex 'C2' stringdef U+0102 hex 'C3' stringdef U+00C4 hex 'C4' stringdef U+0139 hex 'C5' stringdef U+0106 hex 'C6' stringdef U+00C7 hex 'C7' stringdef U+010C hex 'C8' stringdef U+00C9 hex 'C9' stringdef U+0118 hex 'CA' stringdef U+00CB hex 'CB' stringdef U+011A hex 'CC' stringdef U+00CD hex 'CD' stringdef U+00CE hex 'CE' stringdef U+010E hex 'CF' stringdef U+0110 hex 'D0' stringdef U+0143 hex 'D1' stringdef U+0147 hex 'D2' stringdef U+00D3 hex 'D3' stringdef U+00D4 hex 'D4' stringdef U+0150 hex 'D5' stringdef U+00D6 hex 'D6' stringdef U+00D7 hex 'D7' stringdef U+0158 hex 'D8' stringdef U+016E hex 'D9' stringdef U+00DA hex 'DA' stringdef U+0170 hex 'DB' stringdef U+00DC hex 'DC' stringdef U+00DD hex 'DD' stringdef U+0162 hex 'DE' stringdef U+00DF hex 'DF' stringdef U+0155 hex 'E0' stringdef U+00E1 hex 'E1' stringdef U+00E2 hex 'E2' stringdef U+0103 hex 'E3' stringdef U+00E4 hex 'E4' stringdef U+013A hex 'E5' stringdef U+0107 hex 'E6' stringdef U+00E7 hex 'E7' stringdef U+010D hex 'E8' stringdef U+00E9 hex 'E9' stringdef U+0119 hex 'EA' stringdef U+00EB hex 'EB' stringdef U+011B hex 
'EC' stringdef U+00ED hex 'ED' stringdef U+00EE hex 'EE' stringdef U+010F hex 'EF' stringdef U+0111 hex 'F0' stringdef U+0144 hex 'F1' stringdef U+0148 hex 'F2' stringdef U+00F3 hex 'F3' stringdef U+00F4 hex 'F4' stringdef U+0151 hex 'F5' stringdef U+00F6 hex 'F6' stringdef U+00F7 hex 'F7' stringdef U+0159 hex 'F8' stringdef U+016F hex 'F9' stringdef U+00FA hex 'FA' stringdef U+0171 hex 'FB' stringdef U+00FC hex 'FC' stringdef U+00FD hex 'FD' stringdef U+0163 hex 'FE' stringdef U+02D9 hex 'FF' snowball-3.1.0/charsets/KOI8-R.sbl000066400000000000000000000036671520373054300166010ustar00rootroot00000000000000// KOI8-R character mappings. stringdef U+00A0 hex '9A' stringdef U+00A9 hex 'BF' stringdef U+00B0 hex '9C' stringdef U+00B2 hex '9D' stringdef U+00B7 hex '9E' stringdef U+00F7 hex '9F' stringdef U+0401 hex 'B3' stringdef U+0410 hex 'E1' stringdef U+0411 hex 'E2' stringdef U+0412 hex 'F7' stringdef U+0413 hex 'E7' stringdef U+0414 hex 'E4' stringdef U+0415 hex 'E5' stringdef U+0416 hex 'F6' stringdef U+0417 hex 'FA' stringdef U+0418 hex 'E9' stringdef U+0419 hex 'EA' stringdef U+041A hex 'EB' stringdef U+041B hex 'EC' stringdef U+041C hex 'ED' stringdef U+041D hex 'EE' stringdef U+041E hex 'EF' stringdef U+041F hex 'F0' stringdef U+0420 hex 'F2' stringdef U+0421 hex 'F3' stringdef U+0422 hex 'F4' stringdef U+0423 hex 'F5' stringdef U+0424 hex 'E6' stringdef U+0425 hex 'E8' stringdef U+0426 hex 'E3' stringdef U+0427 hex 'FE' stringdef U+0428 hex 'FB' stringdef U+0429 hex 'FD' stringdef U+042A hex 'FF' stringdef U+042B hex 'F9' stringdef U+042C hex 'F8' stringdef U+042D hex 'FC' stringdef U+042E hex 'E0' stringdef U+042F hex 'F1' stringdef U+0430 hex 'C1' stringdef U+0431 hex 'C2' stringdef U+0432 hex 'D7' stringdef U+0433 hex 'C7' stringdef U+0434 hex 'C4' stringdef U+0435 hex 'C5' stringdef U+0436 hex 'D6' stringdef U+0437 hex 'DA' stringdef U+0438 hex 'C9' stringdef U+0439 hex 'CA' stringdef U+043A hex 'CB' stringdef U+043B hex 'CC' stringdef U+043C hex 'CD' 
stringdef U+043D hex 'CE' stringdef U+043E hex 'CF' stringdef U+043F hex 'D0' stringdef U+0440 hex 'D2' stringdef U+0441 hex 'D3' stringdef U+0442 hex 'D4' stringdef U+0443 hex 'D5' stringdef U+0444 hex 'C6' stringdef U+0445 hex 'C8' stringdef U+0446 hex 'C3' stringdef U+0447 hex 'DE' stringdef U+0448 hex 'DB' stringdef U+0449 hex 'DD' stringdef U+044A hex 'DF' stringdef U+044B hex 'D9' stringdef U+044C hex 'D8' stringdef U+044D hex 'DC' stringdef U+044E hex 'C0' stringdef U+044F hex 'D1' stringdef U+0451 hex 'A3' snowball-3.1.0/charsets/cp850.sbl000066400000000000000000000066661520373054300165310ustar00rootroot00000000000000// Code page 850 (MSDOS Latin 1) character mappings. stringdef U+00A0 hex 'FF' stringdef U+00A1 hex 'AD' stringdef U+00A2 hex 'BD' stringdef U+00A3 hex '9C' stringdef U+00A4 hex 'CF' stringdef U+00A5 hex 'BE' stringdef U+00A6 hex 'DD' stringdef U+00A7 hex 'F5' stringdef U+00A8 hex 'F9' stringdef U+00A9 hex 'B8' stringdef U+00AA hex 'A6' stringdef U+00AB hex 'AE' stringdef U+00AC hex 'AA' stringdef U+00AD hex 'F0' stringdef U+00AE hex 'A9' stringdef U+00AF hex 'EE' stringdef U+00B0 hex 'F8' stringdef U+00B1 hex 'F1' stringdef U+00B2 hex 'FD' stringdef U+00B3 hex 'FC' stringdef U+00B4 hex 'EF' stringdef U+00B5 hex 'E6' stringdef U+00B6 hex 'F4' stringdef U+00B7 hex 'FA' stringdef U+00B8 hex 'F7' stringdef U+00B9 hex 'FB' stringdef U+00BA hex 'A7' stringdef U+00BB hex 'AF' stringdef U+00BC hex 'AC' stringdef U+00BD hex 'AB' stringdef U+00BE hex 'F3' stringdef U+00BF hex 'A8' stringdef U+00C0 hex 'B7' stringdef U+00C1 hex 'B5' stringdef U+00C2 hex 'B6' stringdef U+00C3 hex 'C7' stringdef U+00C4 hex '8E' stringdef U+00C5 hex '8F' stringdef U+00C6 hex '92' stringdef U+00C7 hex '80' stringdef U+00C8 hex 'D4' stringdef U+00C9 hex '90' stringdef U+00CA hex 'D2' stringdef U+00CB hex 'D3' stringdef U+00CC hex 'DE' stringdef U+00CD hex 'D6' stringdef U+00CE hex 'D7' stringdef U+00CF hex 'D8' stringdef U+00D0 hex 'D1' stringdef U+00D1 hex 'A5' stringdef 
U+00D2 hex 'E3' stringdef U+00D3 hex 'E0' stringdef U+00D4 hex 'E2' stringdef U+00D5 hex 'E5' stringdef U+00D6 hex '99' stringdef U+00D7 hex '9E' stringdef U+00D8 hex '9D' stringdef U+00D9 hex 'EB' stringdef U+00DA hex 'E9' stringdef U+00DB hex 'EA' stringdef U+00DC hex '9A' stringdef U+00DD hex 'ED' stringdef U+00DE hex 'E8' stringdef U+00DF hex 'E1' stringdef U+00E0 hex '85' stringdef U+00E1 hex 'A0' stringdef U+00E2 hex '83' stringdef U+00E3 hex 'C6' stringdef U+00E4 hex '84' stringdef U+00E5 hex '86' stringdef U+00E6 hex '91' stringdef U+00E7 hex '87' stringdef U+00E8 hex '8A' stringdef U+00E9 hex '82' stringdef U+00EA hex '88' stringdef U+00EB hex '89' stringdef U+00EC hex '8D' stringdef U+00ED hex 'A1' stringdef U+00EE hex '8C' stringdef U+00EF hex '8B' stringdef U+00F0 hex 'D0' stringdef U+00F1 hex 'A4' stringdef U+00F2 hex '95' stringdef U+00F3 hex 'A2' stringdef U+00F4 hex '93' stringdef U+00F5 hex 'E4' stringdef U+00F6 hex '94' stringdef U+00F7 hex 'F6' stringdef U+00F8 hex '9B' stringdef U+00F9 hex '97' stringdef U+00FA hex 'A3' stringdef U+00FB hex '96' stringdef U+00FC hex '81' stringdef U+00FD hex 'EC' stringdef U+00FE hex 'E7' stringdef U+00FF hex '98' stringdef U+0131 hex 'D5' stringdef U+0192 hex '9F' stringdef U+2017 hex 'F2' stringdef U+2500 hex 'C4' stringdef U+2502 hex 'B3' stringdef U+250C hex 'DA' stringdef U+2510 hex 'BF' stringdef U+2514 hex 'C0' stringdef U+2518 hex 'D9' stringdef U+251C hex 'C3' stringdef U+2524 hex 'B4' stringdef U+252C hex 'C2' stringdef U+2534 hex 'C1' stringdef U+253C hex 'C5' stringdef U+2550 hex 'CD' stringdef U+2551 hex 'BA' stringdef U+2554 hex 'C9' stringdef U+2557 hex 'BB' stringdef U+255A hex 'C8' stringdef U+255D hex 'BC' stringdef U+2560 hex 'CC' stringdef U+2563 hex 'B9' stringdef U+2566 hex 'CB' stringdef U+2569 hex 'CA' stringdef U+256C hex 'CE' stringdef U+2580 hex 'DF' stringdef U+2584 hex 'DC' stringdef U+2588 hex 'DB' stringdef U+2591 hex 'B0' stringdef U+2592 hex 'B1' stringdef U+2593 hex 'B2' 
stringdef U+25A0 hex 'FE' snowball-3.1.0/compiler/000077500000000000000000000000001520373054300151505ustar00rootroot00000000000000snowball-3.1.0/compiler/analyser.c000066400000000000000000003147711520373054300171470ustar00rootroot00000000000000#include #include #include /* for INT_MAX */ #include /* printf etc */ #include /* exit */ #include /* memmove */ #include "header.h" /* recursive usage: */ static void read_program_(struct analyser * a, int terminator); static struct node * read_C(struct analyser * a); static struct node * new_string_command(struct analyser * a, int token); static void print_node_(const struct node * p, int n, const char * s) { printf("%*s%s", n * 2, s, name_of_token(p->type)); if (p->name) { putchar(' '); report_s(stdout, p->name->s); } if (p->literalstring) { printf(" '"); report_b(stdout, p->literalstring); printf("'"); } else if (p->type == c_number) { printf(" %d", p->number); } printf("\n"); if (p->AE) print_node_(p->AE, n+1, "# "); if (p->left) print_node_(p->left, n+1, ""); if (p->aux) print_node_(p->aux, n+1, "@ "); if (p->right) print_node_(p->right, n, ""); } extern void print_program(struct analyser * a) { if (a->program) print_node_(a->program, 0, ""); } static struct node * new_node_at_line(struct analyser * a, int type, int line) { NEW(node, p); *p = (struct node){0}; p->mode = a->mode; p->line_number = line; p->type = type; p->next = a->nodes; a->nodes = p; return p; } static struct node * new_node(struct analyser * a, int type) { return new_node_at_line(a, type, a->tokeniser->line_number); } static const char * name_of_mode(int n) { switch (n) { case m_backward: return "string backward"; case m_forward: return "string forward"; } fprintf(stderr, "Invalid mode %d in name_of_mode()\n", n); exit(1); } static const char * name_of_type(int code) { switch (code) { case t_string: return "string"; case t_boolean: return "boolean"; case t_integer: return "integer"; case t_routine: return "routine"; case t_external: return "external"; 
case t_grouping: return "grouping"; } fprintf(stderr, "Invalid type code %d in name_of_type()\n", code); exit(1); } static void count_error(struct analyser * a) { struct tokeniser * t = a->tokeniser; if (t->error_count >= 20) { fprintf(stderr, "... etc\n"); exit(1); } t->error_count++; } static void report_error_location_line(struct analyser * a, int line) { struct tokeniser * t = a->tokeniser; count_error(a); fprintf(stderr, "%s:%d: ", t->file, line); } static void report_error_location(struct analyser * a) { struct tokeniser * t = a->tokeniser; report_error_location_line(a, t->line_number); } static void report_error_after(struct analyser * a) { struct tokeniser * t = a->tokeniser; if (t->previous_token > 0) fprintf(stderr, " after %s", name_of_token(t->previous_token)); } static void omission_error(struct analyser * a, int n) { report_error_location(a); fprintf(stderr, "%s omitted", name_of_token(n)); report_error_after(a); putc('\n', stderr); } static void unexpected_token_error(struct analyser * a, const char * context) { struct tokeniser * t = a->tokeniser; if (t->token_reported_as_unexpected) { // Avoid duplicate errors if this token was already reported as // unexpected and then held. return; } report_error_location(a); t->token_reported_as_unexpected = true; fprintf(stderr, "unexpected %s", name_of_token(t->token)); if (t->token == c_number) fprintf(stderr, " %d", t->number); if (t->token == c_name) { fprintf(stderr, " %.*s", SIZE(t->s), t->s); } if (context) { fprintf(stderr, " in %s", context); } report_error_after(a); putc('\n', stderr); // If the token is `)` then always hold it as the actual problem is almost // certainly another token missing before it. 
if (t->token == c_ket) hold_token(t); } static void substring_without_among_error(struct analyser * a) { count_error(a); fprintf(stderr, "%s:%d: 'substring' with no matching 'among'\n", a->tokeniser->file, a->substring->line_number); } static int check_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; if (t->token != code) { omission_error(a, code); hold_token(t); return false; } return true; } static void hold_token_if_toplevel(struct tokeniser * t) { // Hold token if it starts a top-level construct. switch (t->token) { case c_backwardmode: case c_booleans: case c_define: case c_externals: case c_groupings: case c_integers: case c_routines: case c_strings: hold_token(t); } } static int get_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; read_token(t); return check_token(a, code); } static struct name * look_for_name(struct analyser * a) { const byte * q = a->tokeniser->s; for (struct name * p = a->names; p; p = p->next) { byte * b = p->s; int n = SIZE(b); if (n == SIZE(q) && memcmp(q, b, n) == 0) { ++p->references; return p; } } return NULL; } static struct name * find_name(struct analyser * a) { struct name * p = look_for_name(a); if (p == NULL) { report_error_location(a); byte * s = a->tokeniser->s; fprintf(stderr, "'%.*s' undeclared\n", SIZE(s), s); } return p; } static void check_routine_mode(struct analyser * a, struct name * p, int mode) { if (p->mode == m_unknown) { p->mode = mode; } else if (p->mode != mode) { report_error_location(a); fprintf(stderr, "%s '%.*s' mis-used in %s mode\n", name_of_type(p->type), SIZE(p->s), p->s, name_of_mode(mode)); } } static int check_name_type(struct analyser * a, struct name * p, int type) { if (p->type == type) return true; if (type == t_routine && p->type == t_external) return true; report_error_location(a); fprintf(stderr, "'%.*s' not of type %s\n", SIZE(p->s), p->s, type == t_routine ? 
"routine or external" : name_of_type(type)); return false; } static void read_names(struct analyser * a, int type) { struct tokeniser * t = a->tokeniser; if (!get_token(a, c_bra)) return; while (true) { int token = read_token(t); switch (token) { case c_len: { /* Context-sensitive token - once declared as a name, it loses * its special meaning, for compatibility with older versions * of snowball. */ SET_SIZE(t->s, 0); t->s = add_literal_to_s(t->s, "len"); goto handle_as_name; } case c_lenof: { /* Context-sensitive token - once declared as a name, it loses * its special meaning, for compatibility with older versions * of snowball. */ SET_SIZE(t->s, 0); t->s = add_literal_to_s(t->s, "lenof"); goto handle_as_name; } case c_name: handle_as_name: if (token != c_name) { disable_token(t, token); } if (look_for_name(a) != NULL) { report_error_location(a); fprintf(stderr, "'%.*s' re-declared\n", SIZE(t->s), t->s); } else { NEW(name, p); *p = (struct name){0}; p->mode = m_unknown; /* used for routines, externals */ p->s = copy_s(t->s); p->type = type; /* Delay assigning counts until after we've eliminated * variables whose values are never used and checked for * variables which can be localised. */ p->count = -1; p->declaration_line_number = t->line_number; // Check if any existing names of the same type differ // only by case - if we find one we set a flag so we know // to mangle this name for languages with case-insensitive // identifiers. (Note that the first declared name of any // group of colliding names collision doesn't get this flag // set so won't get mangled.) 
for (struct name * q = a->names; q; q = q->next) { if (q->type != type) continue; byte * b = q->s; int n = SIZE(b); if (n != SIZE(p->s)) continue; for (int i = 0; i < n; ++i) { if (tolower(p->s[i]) != tolower(b[i])) goto next_name; } p->case_collision = true; goto done_case_check; next_name: ; } done_case_check: p->next = a->names; a->names = p; } break; default: check_token(a, c_ket); return; } } } static symbol * new_literalstring(struct analyser * a) { NEW(literalstring, p); p->b = copy_b(a->tokeniser->b); p->next = a->literalstrings; a->literalstrings = p; return p->b; } static int read_AE_test(struct analyser * a) { struct tokeniser * t = a->tokeniser; switch (read_token(t)) { case c_assign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: return t->token; default: unexpected_token_error(a, "integer test expression"); hold_token(t); return c_eq; } } static int binding(int t) { switch (t) { case c_plus: case c_minus: return 1; case c_multiply: case c_divide: return 2; default: return -2; } } static void mark_used_in(struct analyser * a, struct name * q, struct node * p) { if (!q->used) { q->used = p; q->local_to = a->program_end->name; } else if (q->local_to) { if (q->local_to != a->program_end->name) { /* Used in more than one routine/external. */ q->local_to = NULL; } } } static void name_to_node(struct analyser * a, struct node * p, int type) { struct name * q = find_name(a); if (q) { check_name_type(a, q, type); mark_used_in(a, q, p); } p->name = q; } static struct node * read_AE(struct analyser * a, struct name * assigned_to, int B) { struct tokeniser * t = a->tokeniser; struct node * p; struct node * q; switch (read_token(t)) { case c_minus: { /* monadic */ // Note current line number so c_neg node reports the right line. 
int neg_line = a->tokeniser->line_number; q = read_AE(a, assigned_to, 100); if (q->type == c_neg) { /* Optimise away double negation, which avoids generators * having to worry about generating "--" (decrement operator * in many languages). */ p = q->right; /* Don't free q, it's in the linked list a->nodes. */ break; } if (q->type == c_number) { /* Negated constant. */ q->number = -q->number; p = q; break; } p = new_node_at_line(a, c_neg, neg_line); p->right = q; break; } case c_bra: p = read_AE(a, assigned_to, 0); get_token(a, c_ket); break; case c_name: p = new_node(a, c_name); name_to_node(a, p, t_integer); if (p->name) { // $x = x + 1 shouldn't count as a use of x. p->name->value_used = (p->name != assigned_to); } break; case c_maxint: case c_minint: a->int_limits_used = true; /* fall through */ case c_cursor: case c_limit: case c_len: case c_size: p = new_node(a, t->token); break; case c_number: p = new_node(a, c_number); p->number = t->number; p->fixed_constant = true; break; case c_lenof: case c_sizeof: { int token = t->token; p = new_string_command(a, token); if (!p->literalstring) { if (p->name) p->name->value_used = true; break; } /* Replace lenof or sizeof on a literal string with a numeric * constant. */ int result = 0; if (token == c_lenof && t->encoding == ENC_UTF8) { // UTF-8. symbol * b = p->literalstring; int dummy; for (int i = 0; i < SIZE(b); i += get_utf8(b + i, &dummy)) { ++result; } } else { result = SIZE(p->literalstring); } p->type = c_number; p->literalstring = NULL; p->number = result; p->fixed_constant = (token == c_lenof); break; } default: unexpected_token_error(a, "integer expression"); hold_token(t); return NULL; } while (true) { int token = read_token(t); int op_line = t->line_number; int b = binding(token); if (binding(token) <= B) { hold_token(t); return p; } struct node * r = read_AE(a, assigned_to, b); if (p->type == c_number && r->type == c_number && // Can't evaluate division by zero. 
!(token == c_divide && r->number == 0)) { // Evaluate constant sub-expression. q = p; switch (token) { case c_plus: q->number += r->number; break; case c_minus: q->number -= r->number; break; case c_multiply: q->number *= r->number; break; case c_divide: q->number /= r->number; break; default: fprintf(stderr, "Unexpected AE operator %s\n", name_of_token(token)); exit(1); } q->fixed_constant = q->fixed_constant && r->fixed_constant; } else { // Check for specific constant or no-op cases. q = NULL; switch (token) { case c_plus: // 0 + r is r if (p->type == c_number && p->number == 0) { q = r; break; } // p + 0 is p if (r->type == c_number && r->number == 0) { q = p; break; } break; case c_minus: // 0 - r is -r if (p->type == c_number && p->number == 0) { q = new_node_at_line(a, c_neg, op_line); q->right = r; break; } // p - 0 is p if (r->type == c_number && r->number == 0) { q = p; break; } break; case c_multiply: // 0 * r is 0 if (p->type == c_number && p->number == 0) { q = p; break; } // p * 0 is 0 if (r->type == c_number && r->number == 0) { q = r; break; } // -1 * r is -r if (p->type == c_number && p->number == -1) { q = new_node_at_line(a, c_neg, p->line_number); q->right = r; break; } // p * -1 is -p if (r->type == c_number && r->number == -1) { q = new_node_at_line(a, c_neg, r->line_number); q->right = p; break; } // 1 * r is r if (p->type == c_number && p->number == 1) { q = r; break; } // p * 1 is p if (r->type == c_number && r->number == 1) { q = p; break; } break; case c_divide: // p / 1 is p if (r->type == c_number && r->number == 1) { q = p; break; } // p / -1 is -p if (r->type == c_number && r->number == -1) { q = new_node_at_line(a, c_neg, r->line_number); q->right = p; break; } // p / 0 is an error! 
if (r->type == c_number && r->number == 0) { report_error_location_line(a, op_line); fprintf(stderr, "Division by zero\n"); } break; } if (!q) { q = new_node_at_line(a, token, op_line); q->left = p; q->right = r; } } p = q; } } static int is_just_false(struct node * q) { if (!q) return 1; if (q->type == c_false) return 1; if (q->type != c_bra) return 0; return is_just_false(q->left); } static struct node * read_or(struct analyser * a, struct node * n) { struct tokeniser * t = a->tokeniser; // Note current line number so c_or node reports the right line. int or_line = t->line_number; struct node * p = is_just_false(n) ? NULL : n; struct node * p_end = p; do { struct node * q = read_C(a); // Discard `false` nodes in an `or` chain. if (!is_just_false(q)) { if (p_end) { p_end->right = q; } else { p = q; } p_end = q; } } while (read_token(t) == c_or); hold_token(t); if (p == NULL) { // All sub-nodes are `false` so return the first. return n; } else if (p->right == NULL) { return p; } n = new_node_at_line(a, c_or, or_line); n->left = p; return n; } static int is_just_true(struct node * q) { if (!q) return 1; if (q->type != c_bra && q->type != c_true) return 0; return is_just_true(q->left) && is_just_true(q->right); } static struct node * read_and(struct analyser * a, struct node * n) { struct tokeniser * t = a->tokeniser; // Note current line number so c_and node reports the right line. int and_line = t->line_number; struct node * p = is_just_true(n) ? NULL : n; struct node * p_end = p; do { struct node * q = read_C(a); // Discard nodes equivalent to `true` in an `and` chain. if (!is_just_true(q)) { if (p_end) { p_end->right = q; } else { p = q; } p_end = q; } } while (read_token(t) == c_and); hold_token(t); if (p == NULL) { // Note: is_just_true(n). 
return n; } else if (p->right == NULL) { return p; } n = new_node_at_line(a, c_and, and_line); n->left = p; return n; } static struct node * read_C_list(struct analyser * a) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, c_bra); struct node * p_end = NULL; while (true) { int token = read_token(t); if (token == c_ket) return p; if (token < 0) { omission_error(a, c_ket); return p; } hold_token(t); struct node * q = read_C(a); while (true) { token = read_token(t); if (token == c_or) { q = read_or(a, q); } else if (token == c_and) { q = read_and(a, q); } else { hold_token(t); break; } } if (p_end == NULL) p->left = q; else p_end->right = q; p_end = q; } } static struct node * new_string_command(struct analyser * a, int token) { struct node * p = new_node(a, token); int str_token = read_token(a->tokeniser); if (str_token == c_literalstring) { p->literalstring = new_literalstring(a); } else if (str_token == c_name) { name_to_node(a, p, t_string); } else { report_error_location(a); fprintf(stderr, "string omitted"); report_error_after(a); putc('\n', stderr); hold_token(a->tokeniser); } return p; } static struct node * read_literalstring(struct analyser * a) { struct node * p = new_node(a, c_literalstring); p->literalstring = new_literalstring(a); return p; } static void reverse_b(symbol * b) { int i = 0; int j = SIZE(b) - 1; while (i < j) { int ch1 = b[i]; int ch2 = b[j]; b[i++] = ch2; b[j--] = ch1; } } static int compare_amongvec(const void *pv, const void *qv) { const struct amongvec * p = (const struct amongvec*)pv; const struct amongvec * q = (const struct amongvec*)qv; symbol * b_p = p->b; int p_size = p->size; symbol * b_q = q->b; int q_size = q->size; int smaller_size = p_size < q_size ? 
p_size : q_size;
    // Order by the symbols of the common prefix first, then by length
    // (shorter first), then by original position so the order is total.
    for (int i = 0; i < smaller_size; i++)
        if (b_p[i] != b_q[i]) return b_p[i] - b_q[i];
    if (p_size - q_size) return p_size - q_size;
    return p->string_index - q->string_index;
}

// Cheap front-end to nodes_equivalent_(): identical pointers (including both
// NULL) are equivalent; exactly one NULL is not; otherwise compare deeply.
#define nodes_equivalent(P, Q) \
    ((P) == (Q) || ((P) && (Q) && nodes_equivalent_((P), (Q))))

/* Structural comparison of two node trees.
 *
 * Compares type, mode, number (for c_number nodes), name (by pointer),
 * literalstring (by content) and recursively left, AE, aux and right.
 * Returns true if all of those match, false otherwise.
 */
static int nodes_equivalent_(const struct node *p, const struct node *q) {
    if (p == q) return true;
    if (p == NULL || q == NULL) return false;

    if (p->type != q->type) return false;
    if (p->mode != q->mode) return false;
    if (p->type == c_number) {
        if (p->number != q->number) return false;
    }

    if (!nodes_equivalent(p->left, q->left)) return false;
    if (!nodes_equivalent(p->AE, q->AE)) return false;
    if (!nodes_equivalent(p->aux, q->aux)) return false;

    if (p->name != q->name) return false;

    if (p->literalstring != q->literalstring) {
        if (!p->literalstring ||
            !q->literalstring ||
            SIZE(p->literalstring) != SIZE(q->literalstring) ||
            memcmp(p->literalstring, q->literalstring,
                   SIZE(p->literalstring) * sizeof(symbol)) != 0) {
            return false;
        }
    }

    return nodes_equivalent(p->right, q->right);
}

/* Build the among table for the c_among node p.
 *
 * p->left is the list of literal strings and bracketed actions read by
 * read_among() and p->number the count of literal strings; substring is the
 * pending `substring` node, or NULL.
 *
 * An among with exactly one string is rewritten into simpler commands and
 * the table is freed again; otherwise the table is appended to a->amongs.
 * Returns the node which should take the place of the among command.
 */
static struct node * make_among(struct analyser * a, struct node * p, struct node * substring) {
    NEW(among, x);
    NEWVEC(amongvec, v, p->number);
    struct node * q = p->left;
    struct node * starter = NULL;
    // w1 is the next free table entry; w0 is the first entry still waiting
    // to be given an action.
    struct amongvec * w0 = v;
    struct amongvec * w1 = v;
    int result = 1;

    int direction = substring != NULL ? substring->mode : p->mode;
    int backward = direction == m_backward;

    *x = (struct among){0};
    x->node = p;
    x->b = v;
    x->shortest_size = INT_MAX;
    x->in_routine = a->current_routine;

    if (q->type == c_bra) {
        fprintf(stderr,
                "%s:%d: warning: among starter is a legacy feature - put "
                "starter code between `substring` and `among` instead\n",
                a->tokeniser->file, q->line_number);
        starter = q;
        p->left = q = q->right;
    }

    int string_index = 0;
    while (q) {
        if (q->type == c_literalstring) {
            symbol * b = q->literalstring;
            w1->b = b;           /* pointer to case string */
            w1->action = NULL;   /* action gets filled in later */
            w1->line_number = q->line_number;
            w1->size = SIZE(b);  /* number of characters in string */
            w1->i = -1;          /* index of longest substring */
            w1->result = -1;     /* number of corresponding case expression */
            w1->string_index = string_index++;
            if (q->left) {
                // String is gated by a routine.
                struct name * function = q->left->name;
                w1->function = function;
                ++function->used_in_among;
                check_routine_mode(a, function, direction);
                // Number each distinct function from 1 within this among.
                if (function->among_index == 0) {
                    function->among_index = ++x->function_count;
                }
                w1->function_index = function->among_index;
            } else {
                w1->function = NULL;
                w1->function_index = 0;
                if (w1->size == 0) {
                    // This among contains the empty string without a gating
                    // function so it will always match.
                    x->always_matches = true;
                }
            }
            w1++;
        } else if (q->left == NULL) {
            /* empty command: () */
            w0 = w1;
        } else {
            /* Check for previous action which is the same as this one and use
             * the same action code if we find one.
             */
            int among_result = -1;
            struct node * action = q;
            struct amongvec * w;
            for (w = v; w < w0; ++w) {
                if (w->action && nodes_equivalent(w->action->left, q->left)) {
                    if (w->result <= 0) {
                        printf("Among code %d isn't positive\n", w->result);
                        exit(1);
                    }
                    action = w->action;
                    among_result = w->result;
                    break;
                }
            }
            if (among_result < 0) {
                among_result = result++;
            }

            // Give this action to every string read since the last action.
            while (w0 != w1) {
                w0->action = action;
                w0->result = among_result;
                w0++;
            }
        }
        q = q->right;
    }
    // Internal consistency check: entries filled must match strings counted.
    if (w1-v != p->number) {
        fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number);
        exit(1);
    }
    x->command_count = result - 1;
    {
        NEWVEC(node*, commands, x->command_count);
        // same_action: -2 = no action seen yet, -1 = actions differ,
        // otherwise the node type shared by every action seen so far.
        x->same_action = -2;
        for (int i = 0; i != x->command_count; ++i)
            commands[i] = NULL;
        for (w0 = v; w0 < w1; w0++) {
            if (w0->result > 0) {
                /* result == -1 when there's no command. */
                if (w0->result > x->command_count) {
                    fprintf(stderr, "More among codes than expected\n");
                    exit(1);
                }
                if (!commands[w0->result - 1]) {
                    commands[w0->result - 1] = w0->action;

                    // Check if all actions are a single command of the same
                    // type with a literalstring argument.
                    if (x->same_action > -1) {
                        if (w0->action->left->right ||
                            !w0->action->left->literalstring ||
                            x->same_action != w0->action->left->type) {
                            x->same_action = -1;
                        }
                    } else if (x->same_action == -2) {
                        if (w0->action->left->right ||
                            !w0->action->left->literalstring) {
                            x->same_action = -1;
                        } else {
                            x->same_action = w0->action->left->type;
                        }
                    }
                }
            } else {
                ++x->nocommand_count;
            }
            // In backward mode the strings are sorted reversed; they are
            // reversed back again after the substring search below.
            if (backward) reverse_b(w0->b);
        }
        x->commands = commands;
    }
    qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec);

    /* the following loop is O(n squared) */
    for (w0 = w1 - 1; w0 >= v; w0--) {
        symbol * b = w0->b;
        int size = w0->size;
        struct amongvec * w;

        // Empty strings don't contribute to shortest/longest size.
        if (size) {
            if (size < x->shortest_size) x->shortest_size = size;
            if (size > x->longest_size) x->longest_size = size;
        }

        for (w = w0 - 1; w >= v; w--) {
            if (w->size < size && memcmp(w->b, b, w->size * sizeof(symbol)) == 0) {
                w0->i = w - v;  /* fill in index of longest substring */
                break;
            }
        }
    }
    if (backward)
        for (w0 = v; w0 < w1; w0++) reverse_b(w0->b);

    // After sorting, equal strings are adjacent.
    for (w0 = v; w0 < w1 - 1; w0++)
        if (w0->size == (w0 + 1)->size &&
            memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) {
            count_error(a);
            fprintf(stderr, "%s:%d: among(...) has repeated string '",
                    a->tokeniser->file, (w0 + 1)->line_number);
            report_b(stderr, (w0 + 1)->b);
            fprintf(stderr, "'\n");
            count_error(a);
            fprintf(stderr, "%s:%d: previously seen here\n",
                    a->tokeniser->file, w0->line_number);
        }

    x->literalstring_count = p->number;
    p->among = x;

    if (starter) {
        // Move the legacy starter code in front of the among, wrapping both
        // in a new c_bra node (and synthesising a substring node if needed).
        starter->right = p;
        p = new_node_at_line(a, c_bra, starter->line_number);
        if (substring) {
            p->left = starter;
        } else {
            substring = new_node_at_line(a, c_substring, starter->line_number);
            substring->right = starter;
            p->left = substring;
        }
    }

    // Clear any among_index values we set so we correctly handle a function
    // used in more than one among.
    for (int i = 0; i < x->literalstring_count; i++) {
        if (v[i].function) {
            v[i].function->among_index = 0;
        }
    }

    if (x->literalstring_count == 1) {
        // Eliminate single-case amongs.  Sometimes it's the natural way to
        // express a single rule in Snowball code as it can show commonality
        // with rulesets with multiple rules, but it's silly to actually
        // generate as an among.
        //
        // We handle an `among` which only has the empty string here - this is
        // syntactically valid but is not a useful construct so we warn about
        // it.
        if (substring) {
            substring->among = NULL;
            if (SIZE(v[0].b) == 0) {
                // substring ... among ( '' (C) )
                //
                // becomes:
                //
                // ... (C)
                fprintf(stderr,
                        "%s:%d: warning: `among` with only empty string always matches\n",
                        a->tokeniser->file, p->line_number);
                substring->type = c_true;
            } else {
                substring->type = c_literalstring;
                substring->literalstring = v[0].b;
            }
            if (v[0].action) {
                // substring ... among ( S (C) )
                //
                // becomes:
                //
                // S ... (C)
                p = v[0].action;
            } else {
                // substring ... among ( S )
                //
                // becomes:
                //
                // S ... true
                p = new_node_at_line(a, c_true, v[0].line_number);
            }
        } else {
            if (v[0].action) {
                // among ( S (C) )
                //
                // becomes:
                //
                // (S C)
                p = v[0].action;
                assert(p->type == c_bra);
                if (SIZE(v[0].b) == 0) {
                    fprintf(stderr,
                            "%s:%d: warning: `among` with only empty string always matches\n",
                            a->tokeniser->file, p->line_number);
                } else {
                    // Insert a c_literalstring node at the start of (C)
                    struct node * literalstring = new_node(a, c_literalstring);
                    literalstring->literalstring = v[0].b;
                    literalstring->right = p->left;
                    p->left = literalstring;
                }
            } else {
                // among ( S )
                //
                // becomes:
                //
                // S
                if (SIZE(v[0].b) == 0) {
                    fprintf(stderr,
                            "%s:%d: warning: `among` with only empty string always matches\n",
                            a->tokeniser->file, p->line_number);
                    p->type = c_true;
                } else {
                    p->type = c_literalstring;
                    p->literalstring = v[0].b;
                }
                p->left = NULL;
            }
        }

        if (v[0].function) {
            // If there's an among function, convert the action to:
            //
            // FUNC and C
            struct node * and_node = new_node(a, c_and);
            and_node->left = new_node(a, c_call);
            and_node->left->name = v[0].function;
            and_node->left->right = p;
            p = and_node;
            // The function is now called directly rather than via an among.
            --v[0].function->used_in_among;
        }

        // The among table isn't needed once the among has been rewritten.
        FREE(x->commands);
        FREE(x);
        FREE(v);
        return p;
    }

    if (x->function_count) {
        if (a->current_routine) a->current_routine->among_with_function = true;
    }

    x->substring = substring;
    if (substring != NULL) substring->among = x;

    // Append to the analyser's list of amongs.
    if (a->amongs == NULL) a->amongs = x; else a->amongs_end->next = x;
    a->amongs_end = x;

    return p;
}

/* Parse `among ( ... )`, collecting literal strings (each optionally
 * followed by a routine name) and bracketed actions as a list under a
 * c_among node, then hand the result to make_among() provided no errors
 * have been counted.
 */
static struct node * read_among(struct analyser * a) {
    struct tokeniser * t = a->tokeniser;
    struct node * p = new_node(a, c_among);
    struct node * p_end = NULL;
    int previous_token = -1;
    // Claim any pending `substring` for this among.
    struct node * substring = a->substring;

    a->substring = NULL;
    p->number = 0; /* counts the number of literals */
    if (!get_token(a, c_bra)) return p;
    while (true) {
        struct node * q;
        int token = read_token(t);
        switch (token) {
            case c_literalstring:
                q = read_literalstring(a);
                if (read_token(t) == c_name) {
                    struct node * r = new_node(a, c_name);
                    name_to_node(a, r,
t_routine); q->left = r; } else { hold_token(t); } p->number++; break; case c_bra: if (previous_token == c_bra) { report_error_location(a); fprintf(stderr, "two adjacent bracketed expressions in among(...)\n"); } q = read_C_list(a); if (is_just_true(q->left)) { /* Convert anything equivalent to () to () so we handle it * the same way. */ q->left = NULL; } break; default: unexpected_token_error(a, "among(...)"); previous_token = token; continue; case c_ket: if (p->number == 0) { report_error_location(a); fprintf(stderr, "empty among(...)\n"); } if (t->error_count == 0) p = make_among(a, p, substring); return p; } previous_token = token; if (p_end == NULL) p->left = q; else p_end->right = q; p_end = q; } } static struct node * read_substring(struct analyser * a) { struct node * p = new_node(a, c_substring); if (a->substring != NULL) { substring_without_among_error(a); } a->substring = p; return p; } static void check_modifyable(struct analyser * a) { if (!a->modifyable) { struct tokeniser * t = a->tokeniser; report_error_location(a); fprintf(stderr, "%s not allowed inside reverse(...)\n", name_of_token(t->token)); } } static int ae_uses_name(struct node * p, struct name * q) { if (!p) { // AE is NULL after a syntax error, e.g. `$x = $y` return 0; } switch (p->type) { case c_name: case c_lenof: case c_sizeof: if (p->name == q) return 1; break; case c_neg: return ae_uses_name(p->right, q); case c_multiply: case c_plus: case c_minus: case c_divide: return ae_uses_name(p->left, q) || ae_uses_name(p->right, q); } return 0; } static struct node * read_C(struct analyser * a) { struct tokeniser * t = a->tokeniser; int token = read_token(t); switch (token) { case c_bra: { struct node * p = read_C_list(a); if (p->type != c_bra) { fprintf(stderr, "read_C_list returned unexpected type %s\n", name_of_token(p->type)); exit(1); } if (p->left && !p->left->right) { // Replace a single entry command list with the command it // contains in order to make subsequent optimisations easier. 
p = p->left; } return p; } case c_backwards: { int mode = a->mode; if (a->mode == m_backward) { report_error_location(a); fprintf(stderr, "'backwards' used when already in this mode\n"); } a->mode = m_backward; struct node * p = new_node(a, token); p->left = read_C(a); a->mode = mode; return p; } case c_reverse: { int mode = a->mode; int modifyable = a->modifyable; a->modifyable = false; a->mode = (mode == m_forward) ? m_backward : m_forward; struct node * p = new_node(a, token); p->left = read_C(a); a->mode = mode; a->modifyable = modifyable; return p; } case c_not: { struct node * subcommand = read_C(a); if (subcommand->type == c_booltest) { /* We synthesise a special command for "not" applied to testing * a boolean variable. */ subcommand->type = c_not_booltest; return subcommand; } struct node * p = new_node(a, token); p->left = subcommand; return p; } case c_try: case c_test: case c_do: case c_repeat: { struct node * p = new_node(a, token); p->left = read_C(a); return p; } case c_fail: { struct node * p = new_node(a, token); p->left = read_C(a); if (!p->left || is_just_true(p->left)) { p->type = c_false; p->left = NULL; } return p; } case c_goto: case c_gopast: { struct node * subcommand = read_C(a); if (subcommand->type == c_grouping || subcommand->type == c_non) { /* We synthesise special commands for "goto" or "gopast" when * used on a grouping or an inverted grouping - the movement of * c by the matching action is exactly what we want! * * Adding the tokens happens to give unique values (the code * would fail to compile if it didn't!) 
*/ switch (token + subcommand->type) { case c_goto + c_grouping: subcommand->type = c_goto_grouping; break; case c_gopast + c_grouping: subcommand->type = c_gopast_grouping; break; case c_goto + c_non: subcommand->type = c_goto_non; break; case c_gopast + c_non: subcommand->type = c_gopast_non; break; default: fprintf(stderr, "Unexpected go/grouping combination: %s %s", name_of_token(token), name_of_token(subcommand->type)); exit(1); } return subcommand; } struct node * p = new_node(a, token); p->left = subcommand; return p; } case c_loop: case c_atleast: { struct node * n = new_node(a, token); n->AE = read_AE(a, NULL, 0); n->left = read_C(a); // n->AE is NULL after a syntax error, e.g. `loop next`. if (n->AE && n->AE->type == c_number) { if (n->AE->number <= 0) { if (token == c_loop) { // `loop N C`, where N <= 0 is a no-op. if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: `loop %d C` is a no-op\n", t->file, n->AE->line_number, n->AE->number); } n->AE = NULL; n->left = NULL; n->type = c_true; } else { // `atleast N C` where N <= 0 -> `repeat C`. if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: atleast %d C is just repeat C\n", t->file, n->AE->line_number, n->AE->number); } n->AE = NULL; n->type = c_repeat; } } else if (n->AE->number == 1) { if (token == c_loop) { // `loop 1 C` -> `C`. 
if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: loop 1 C is just C\n", t->file, n->AE->line_number); } n = n->left; } } } return n; } case c_setmark: { struct node * n = new_node(a, c_assign); n->AE = new_node(a, c_cursor); if (get_token(a, c_name)) { name_to_node(a, n, t_integer); if (n->name) n->name->initialised = true; } return n; } case c_atmark: { struct node * n = new_node(a, c_eq); struct node * AE = read_AE(a, NULL, 0); if (AE->type == c_cursor) { fprintf(stderr, "%s:%d: warning: `atmark cursor` is always true\n", t->file, n->line_number); n->type = c_true; } else { n->left = new_node_at_line(a, c_cursor, n->line_number); n->AE = AE; } return n; } case c_tomark: { struct node * n = new_node(a, token); struct node * AE = read_AE(a, NULL, 0); if (AE->type == c_cursor) { fprintf(stderr, "%s:%d: warning: `tomark cursor` is a no-op\n", t->file, n->line_number); n->type = c_true; } else { n->AE = AE; } return n; } case c_hop: { struct node * n = new_node(a, token); n->AE = read_AE(a, NULL, 0); // n->AE is NULL after a syntax error, e.g. `hop hop`. if (n->AE && n->AE->type == c_number) { if (n->AE->number == 1) { // Convert `hop 1` to `next`. n->AE = NULL; n->type = c_next; } else if (n->AE->number == 0) { if (n->AE->fixed_constant) { fprintf(stderr, "%s:%d: warning: `hop 0` is a no-op\n", t->file, n->AE->line_number); } n->AE = NULL; n->type = c_true; } else if (n->AE->number < 0) { fprintf(stderr, "%s:%d: warning: hop %d now signals f (as was " "always documented) rather than moving the cursor " "in the opposite direction\n", t->file, n->AE->line_number, n->AE->number); n->AE = NULL; n->type = c_false; } } return n; } case c_delete: check_modifyable(a); /* fall through */ case c_next: case c_tolimit: case c_leftslice: case c_rightslice: case c_true: case c_false: return new_node(a, token); case c_atlimit: { int mode = a->mode; struct node * n = new_node(a, mode == m_forward ? 
c_ge : c_le); n->left = new_node_at_line(a, c_cursor, n->line_number); n->AE = new_node_at_line(a, c_limit, n->line_number); return n; } case c_debug: a->debug_used = true; return new_node(a, token); case c_assignto: case c_sliceto: { check_modifyable(a); struct node * n = new_node(a, token); if (get_token(a, c_name)) { name_to_node(a, n, t_string); if (n->name) n->name->initialised = true; } if (token == c_assignto) { fprintf(stderr, "%s:%d: warning: Use of `=>` is not recommended, " "see https://snowballstem.org/compiler/snowman.html " "section 13.3 for details\n", t->file, n->line_number); } return n; } case c_assign: token = c_stringassign; /* FALLTHRU */ case c_insert: case c_attach: case c_slicefrom: { check_modifyable(a); struct node * n = new_string_command(a, token); if (n->name) { n->name->value_used = true; } else if (SIZE(n->literalstring) == 0) { switch (token) { case c_insert: case c_attach: fprintf(stderr, "%s:%d: warning: `%s ''` is a no-op\n", t->file, n->line_number, name_of_token(token)); n->type = c_true; n->literalstring = NULL; break; case c_slicefrom: // Canonicalise `<-''` to `delete`. n->type = c_delete; n->literalstring = NULL; } } return n; } case c_setlimit: { struct node * n = new_node(a, token); n->left = read_C(a); get_token(a, c_for); n->aux = read_C(a); if (n->left->type == c_tomark && n->left->AE->type == c_limit) { fprintf(stderr, "%s:%d: warning: `setlimit tomark limit` is a no-op\n", t->file, n->line_number); return n->aux; } return n; } case c_set: case c_unset: { struct node * n = new_node(a, token); if (get_token(a, c_name)) { name_to_node(a, n, t_boolean); if (n->name) n->name->initialised = true; } return n; } case c_dollar: { int dollar_line = t->line_number; read_token(t); if (t->token == c_bra) { /* Handle newer $(AE REL_OP AE) syntax. 
*/ struct node * n = read_AE(a, NULL, 0); read_token(t); token = t->token; bool eval_constant_expr = false; switch (token) { case c_assign: // Assume `==` was meant to try to avoid an error avalanche. token = c_eq; report_assumed_rel_op_error: report_error_location(a); fprintf(stderr, "Expected relational operator, got '%s' (did you mean '%s'?)\n", name_of_token(t->token), name_of_token(token)); goto handle_rel_op; case c_assignto: // Assume `>=` was meant to try to avoid an error avalanche. // (`=>` instead of `>=` is a possible typo.) token = c_ge; goto report_assumed_rel_op_error; case c_divideassign: // Assume `!=` was meant to try to avoid an error avalanche. // (Ada, Erlang, Fortran90, etc use `/=` for not-equal.) token = c_ne; goto report_assumed_rel_op_error; case c_minusassign: case c_multiplyassign: case c_plusassign: // Give a better error if any other assignment operator // is used in this context. report_error_location(a); fprintf(stderr, "Expected relational operator, got '%s'\n", name_of_token(token)); // Assume `==` was meant to try to avoid an error avalanche. token = c_eq; goto handle_rel_op; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: { // Only evaluate constant expressions if we got a valid // relational operator to avoid spurious unreachable // code warnings after an error. eval_constant_expr = true; handle_rel_op: ; struct node * lhs = n; struct node * rhs = read_AE(a, NULL, 0); if (eval_constant_expr && lhs->type == c_number && rhs->type == c_number) { // Evaluate constant numeric test expression. 
int result; switch (token) { case c_eq: result = (lhs->number == rhs->number); break; case c_ne: result = (lhs->number != rhs->number); break; case c_gt: result = (lhs->number > rhs->number); break; case c_ge: result = (lhs->number >= rhs->number); break; case c_lt: result = (lhs->number < rhs->number); break; case c_le: result = (lhs->number <= rhs->number); break; default: fprintf(stderr, "Unexpected numeric test operator %s\n", name_of_token(t->token)); exit(1); } n = new_node(a, result ? c_true : c_false); } else { n = new_node(a, token); n->left = lhs; n->AE = rhs; } get_token(a, c_ket); break; } default: unexpected_token_error(a, "integer test expression"); hold_token(t); (void)read_AE(a, NULL, 0); get_token(a, c_ket); break; } return n; } if (t->token != c_name) { unexpected_token_error(a, "integer test expression"); hold_token(t); return new_node_at_line(a, c_dollar, dollar_line); } struct name * q = find_name(a); if (q && q->type == t_string) { /* Assume for now that $ on string both initialises and uses * the string variable. FIXME: Can we do better? */ q->initialised = true; q->value_used = true; struct node * p = new_node_at_line(a, c_dollar, dollar_line); int mode = a->mode; int modifyable = a->modifyable; a->mode = m_forward; a->modifyable = true; p->left = read_C(a); a->mode = mode; a->modifyable = modifyable; p->name = q; mark_used_in(a, q, p); return p; } if (q && q->type != t_integer) { /* If $ is used on an unknown name or a name which isn't a * string or an integer then we assume the unknown name is an * integer as $ is used more often on integers than strings, so * hopefully this it less likely to cause an error avalanche. * * For an unknown name, we'll already have reported an error. 
*/ report_error_location(a); fprintf(stderr, "'%.*s' not of type integer or string\n", SIZE(q->s), q->s); q = NULL; } struct node * p = new_node(a, read_AE_test(a)); switch (p->type) { case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: p->left = new_node(a, c_name); p->left->name = q; p->AE = read_AE(a, NULL, 0); if (q) { q->value_used = true; mark_used_in(a, q, p); } return p; } /* +=, etc don't "initialise" as they only amend an existing value. * Similarly, they don't count as using the value. */ p->name = q; p->AE = read_AE(a, q, 0); if (p->AE && p->AE->type == c_number) { switch (p->type) { case c_plusassign: case c_minusassign: if (p->AE->number == 0) { // `$x+=0` and `$x-=0` are no-ops. p->type = c_true; p->name = NULL; p->AE = NULL; } else if (p->AE->number < 0) { // `$x+=-N` -> `$x-=N`, etc as // this may result in slightly // shorter target language code. p->type ^= (c_plusassign ^ c_minusassign); p->AE->number = -p->AE->number; } break; case c_multiplyassign: case c_divideassign: if (p->AE->number == 1) { // `$x*=1` and `$x/=1` are no-ops. p->type = c_true; p->name = NULL; p->AE = NULL; } else if (p->AE->number == 0) { if (p->type == c_divideassign) { report_error_location_line(a, p->line_number); fprintf(stderr, "Division by zero\n"); // Set the largest possible value. 
p->type = c_maxint; p->name = NULL; p->AE = NULL; } else { // `$x*=0` -> `$x=0` p->type = c_assign; } } else if (p->AE->number == -1) { // `$x/=-1` -> `$x=-x` // `$x*=-1` -> `$x=-x` p->type = c_assign; p->AE->number = 0; p->AE->type = c_neg; p->AE->right = new_node_at_line(a, c_name, p->AE->line_number); p->AE->right->name = p->name; } break; } } if (p->type == c_assign && q) { if (ae_uses_name(p->AE, q)) { if (p->AE->type == c_name) { fprintf(stderr, "%s:%d: warning: `$", t->file, p->line_number); report_s(stderr, p->AE->name->s); fprintf(stderr, " = "); report_s(stderr, p->AE->name->s); fprintf(stderr, "` is a no-op\n"); p->AE = NULL; p->type = c_true; q = NULL; } } else { // `$x = ` initialises `x` unless AE references `x`. q->initialised = true; } } if (q) mark_used_in(a, q, p); return p; } case c_name: { struct name * q = find_name(a); struct node * p = new_node(a, c_name); if (q) { mark_used_in(a, q, p); switch (q->type) { case t_boolean: p->type = c_booltest; q->value_used = true; break; case t_integer: report_error_location(a); fprintf(stderr, "integer name '%.*s' misplaced\n", SIZE(t->s), t->s); break; case t_string: q->value_used = true; break; case t_routine: case t_external: p->type = c_call; check_routine_mode(a, q, a->mode); break; case t_grouping: p->type = c_grouping; break; } } p->name = q; return p; } case c_non: { struct node * p = new_node(a, token); read_token(t); if (t->token == c_minus) read_token(t); if (!check_token(a, c_name)) { return p; } name_to_node(a, p, t_grouping); return p; } case c_literalstring: { struct node * p = read_literalstring(a); if (SIZE(p->literalstring) == 0) { fprintf(stderr, "%s:%d: warning: empty literal string is a no-op\n", t->file, p->line_number); p->type = c_true; p->literalstring = NULL; } return p; } case c_among: return read_among(a); case c_substring: return read_substring(a); default: unexpected_token_error(a, 0); return NULL; } } static int next_symbol(symbol * p, symbol * W, int utf8) { if (utf8) { int 
ch; int j = get_utf8(p, & ch); *W = ch; return j; } else { *W = *p; return 1; } } static symbol * alter_grouping(symbol * p, symbol * q, int style, int utf8) { int j = 0; if (style == c_plus) { while (j < SIZE(q)) { symbol W; int width = next_symbol(q + j, &W, utf8); p = add_symbol_to_b(p, W); j += width; } } else { while (j < SIZE(q)) { symbol W; int width = next_symbol(q + j, &W, utf8); for (int i = 0; i < SIZE(p); i++) { if (p[i] == W) { memmove(p + i, p + i + 1, (SIZE(p) - i - 1) * sizeof(symbol)); ADD_TO_SIZE(p, -1); } } j += width; } } return p; } static int compare_symbol(const void *pv, const void *qv) { const symbol * p = (const symbol*)pv; const symbol * q = (const symbol*)qv; return *p - *q; } static int finalise_grouping(struct grouping * p) { if (SIZE(p->b) == 0) { // Empty grouping - leave things in a non-surprising state. p->smallest_ch = p->largest_ch = 0; return false; } qsort(p->b, SIZE(p->b), sizeof(symbol), compare_symbol); p->smallest_ch = p->b[0]; p->largest_ch = p->b[SIZE(p->b) - 1]; // Eliminate duplicates. 
symbol ch = p->b[0]; int j = 1; for (int i = 1; i < SIZE(p->b); i++) { if (p->b[i] != ch) { ch = p->b[j++] = p->b[i]; } } SET_SIZE(p->b, j); return true; } static void read_define_grouping(struct analyser * a, struct name * q) { struct tokeniser * t = a->tokeniser; int style = c_plus; bool check_nonempty = true; { NEW(grouping, p); *p = (struct grouping){0}; if (a->groupings == NULL) a->groupings = p; else a->groupings_end->next = p; a->groupings_end = p; if (q) { if (q->grouping != NULL) { report_error_location(a); fprintf(stderr, "'%.*s' redefined\n", SIZE(t->s), t->s); q->grouping->name = NULL; } q->grouping = p; } p->name = q; p->line_number = t->line_number; p->b = create_b(0); do { switch (read_token(t)) { case c_name: { struct name * r = find_name(a); if (!r) break; if (r == q) { count_error(a); fprintf(stderr, "%s:%d: %.*s defined in terms of itself\n", t->file, t->line_number, SIZE(r->s), r->s); check_nonempty = false; } else if (!r->grouping) { if (check_name_type(a, r, t_grouping)) { count_error(a); fprintf(stderr, "%s:%d: %.*s undefined\n", t->file, t->line_number, SIZE(r->s), r->s); } check_nonempty = false; } else { p->b = alter_grouping(p->b, r->grouping->b, style, false); } r->used_in_definition = true; break; } case c_literalstring: { int utf8 = (a->encoding == ENC_UTF8); int i = 0; while (i < SIZE(t->b)) { symbol ch_i; int width_i = next_symbol(t->b + i, &ch_i, utf8); int j = 0; while (j < i) { symbol ch_j; int width_j = next_symbol(t->b + j, &ch_j, utf8); if (ch_i == ch_j) { fprintf(stderr, "%s:%d: warning: Duplicate " "character in grouping: ", t->file, t->line_number); if (ch_i >= 32 && ch_i < 127) { fprintf(stderr, "'%c'\n", ch_i); } else { fprintf(stderr, "U+%04X\n", ch_i); } } j += width_j; } i += width_i; } p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8)); break; } default: unexpected_token_error(a, "grouping definition"); hold_token_if_toplevel(t); // Don't report an error for an empty grouping as well. 
(void)finalise_grouping(p); return; } style = read_token(t); } while (style == c_plus || style == c_minus); if (!finalise_grouping(p)) { if (check_nonempty) { report_error_location_line(a, p->line_number); fprintf(stderr, "empty grouping\n"); } } hold_token(t); } } static void read_define_routine(struct analyser * a, struct name * q) { struct node * p = new_node(a, c_define); a->current_routine = q; if (q) { if (q->definition != NULL) { report_error_location(a); fprintf(stderr, "'%.*s' redefined\n", SIZE(q->s), q->s); } if (q->mode == m_unknown) { q->mode = a->mode; } else if (q->mode != a->mode) { report_error_location(a); fprintf(stderr, "'%.*s' declared as %s mode; used as %s mode", SIZE(q->s), q->s, name_of_mode(a->mode), name_of_mode(q->mode)); } } p->name = q; if (a->program == NULL) a->program = p; else a->program_end->right = p; a->program_end = p; get_token(a, c_as); p->left = read_C(a); if (q) q->definition = p; /* We should get a node with a NULL right pointer from read_C() for the * routine's code. We synthesise a "functionend" node there so * optimisations such as dead code elimination and tail call optimisation * can easily see where the function ends. */ assert(p->left->right == NULL); if (p->left->type == c_bra) { /* Put the "functionend" node at the end of the command list. */ struct node * e = p->left->left; if (e) { while (e->right) e = e->right; e->right = new_node(a, c_functionend); } else { p->left = new_node(a, c_functionend); } } else { /* Put the "functionend" node after the single command. 
*/ p->left->right = new_node(a, c_functionend); } if (a->substring != NULL) { substring_without_among_error(a); a->substring = NULL; } a->current_routine = NULL; } static void read_define(struct analyser * a) { if (!get_token(a, c_name)) return; struct name * q = find_name(a); int type; if (q) { type = q->type; if (type != t_grouping && type != t_routine && type != t_external) { // If integer, boolean, or string name then generate error and // parse based on the next token. type = (peek_token(a->tokeniser) == c_as) ? t_routine : t_grouping; check_name_type(a, q, type); } } else { /* No declaration so sniff next token - if it is a string or name * we parse as a grouping, otherwise we parse as a routine. This * avoids an avalanche of further errors if `as` is missing from a * routine definition. */ switch (peek_token(a->tokeniser)) { case c_literalstring: case c_name: type = t_grouping; break; default: type = t_routine; } } if (type == t_grouping) { read_define_grouping(a, q); } else { read_define_routine(a, q); } } static void read_backwardmode(struct analyser * a) { int mode = a->mode; a->mode = m_backward; if (get_token(a, c_bra)) { read_program_(a, c_ket); } a->mode = mode; } static void read_program_(struct analyser * a, int terminator) { struct tokeniser * t = a->tokeniser; while (true) { int token = read_token(t); switch (token) { case c_strings: read_names(a, t_string); break; case c_booleans: read_names(a, t_boolean); break; case c_integers: read_names(a, t_integer); break; case c_routines: read_names(a, t_routine); break; case c_externals: read_names(a, t_external); break; case c_groupings: read_names(a, t_grouping); break; case c_define: read_define(a); break; case c_backwardmode:read_backwardmode(a); break; default: if (token == terminator) return; unexpected_token_error(a, 0); break; case -1: if (terminator != -1) omission_error(a, terminator); return; } } } static void remove_dead_assignments(struct node * p, struct name * q) { if (p->name == q) { 
switch (p->type) { case c_assignto: case c_sliceto: case c_assign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_set: case c_unset: case c_dollar: /* c_true is a no-op. */ p->type = c_true; p->AE = NULL; p->name = NULL; break; default: /* There are no read accesses to this variable, so any * references must be assignments. */ fprintf(stderr, "Unhandled type of dead assignment via %s\n", name_of_token(p->type)); exit(1); } } if (p->AE) remove_dead_assignments(p->AE, q); if (p->left) remove_dead_assignments(p->left, q); if (p->aux) remove_dead_assignments(p->aux, q); if (p->right) remove_dead_assignments(p->right, q); } enum { // Not set on at least one code path leading to a use. USE_BEFORE_SET, // Need to keep checking. UNKNOWN, // Set on any code path leading to a use. SET_BEFORE_ANY_USE }; /* Find out if every codepath in the command with node p to a use of variable v * sets v first. * * The checks err towards being too conservative and may report that v can't be * safely localised when it can, but they allow localising all variables which * can be trivially made local in existing stemmers. * * p: the node of the command to check. * func: the c_define of the routine/external this code is in. * v: the variable to check. */ static int always_set_before_use_(struct node * p, struct node * func, struct name * v) { if (!p) return UNKNOWN; switch (p->type) { case c_call: { if (p->name->definition == func) { /* We've recursed into the function we're considering * localising this variable into, which means we can't * localise it because then changes to the variable in * the nested call won't be reflected after it returns. */ return USE_BEFORE_SET; } // We know v is only referenced in the function we are checking. 
return UNKNOWN; } case c_among: { bool all_pass = true; struct among * x = p->among; for (int i = 1; i <= x->command_count; i++) { int r = always_set_before_use_(x->commands[i - 1], func, v); if (r == USE_BEFORE_SET) return r; all_pass = all_pass && (r == SET_BEFORE_ANY_USE); } if (all_pass) return SET_BEFORE_ANY_USE; return UNKNOWN; } case c_or: { struct node * q = p->left; bool all_pass = true; while (q) { int r = always_set_before_use_(q, func, v); if (r == USE_BEFORE_SET) return r; all_pass = all_pass && (r == SET_BEFORE_ANY_USE); q = q->right; } if (all_pass) return SET_BEFORE_ANY_USE; return UNKNOWN; } case c_and: case c_bra: { struct node * q = p->left; while (q) { int r = always_set_before_use_(q, func, v); if (r != UNKNOWN) return r; q = q->right; } return UNKNOWN; } case c_backwards: case c_not: case c_reverse: case c_test: return always_set_before_use_(p->left, func, v); case c_do: case c_fail: case c_gopast: case c_goto: case c_try: case c_repeat: { if (always_set_before_use_(p->left, func, v) == USE_BEFORE_SET) return USE_BEFORE_SET; return UNKNOWN; } case c_atleast: case c_loop: if (always_set_before_use_(p->AE, func, v) == USE_BEFORE_SET) return USE_BEFORE_SET; return always_set_before_use_(p->left, func, v); case c_assign: // Check AE first: `x = x + 1` uses `x` before it sets it. 
if (always_set_before_use_(p->AE, func, v) == USE_BEFORE_SET) return USE_BEFORE_SET; if (p->name == v) return SET_BEFORE_ANY_USE; return UNKNOWN; case c_assignto: case c_set: case c_sliceto: case c_unset: if (p->name == v) return SET_BEFORE_ANY_USE; return UNKNOWN; case c_delete: case c_grouping: case c_leftslice: case c_literalstring: case c_next: case c_non: case c_number: case c_rightslice: case c_debug: case c_substring: case c_tolimit: case c_false: case c_true: case c_goto_grouping: case c_gopast_grouping: case c_goto_non: case c_gopast_non: return UNKNOWN; case c_hop: case c_tomark: if (always_set_before_use_(p->AE, func, v) == USE_BEFORE_SET) return USE_BEFORE_SET; return UNKNOWN; case c_stringassign: case c_attach: case c_booltest: case c_insert: case c_name: case c_not_booltest: case c_slicefrom: if (p->name == v) { return USE_BEFORE_SET; } return UNKNOWN; case c_functionend: return SET_BEFORE_ANY_USE; case c_divide: case c_minus: case c_multiply: case c_plus: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: { int r = always_set_before_use_(p->left, func, v); if (r != UNKNOWN) return r; return always_set_before_use_(p->right, func, v); } case c_neg: return always_set_before_use_(p->right, func, v); case c_lenof: case c_sizeof: if (p->name == v) { return USE_BEFORE_SET; } return UNKNOWN; case c_cursor: case c_len: case c_limit: case c_maxint: case c_minint: case c_size: return UNKNOWN; case c_setlimit: { int r = always_set_before_use_(p->aux, func, v); if (r != UNKNOWN) return r; return always_set_before_use_(p->left, func, v); } case c_divideassign: case c_minusassign: case c_multiplyassign: case c_plusassign: if (p->name == v) { return USE_BEFORE_SET; } if (always_set_before_use_(p->AE, func, v) == USE_BEFORE_SET) { return USE_BEFORE_SET; } return UNKNOWN; case c_dollar: if (p->name != v) { return UNKNOWN; } #if 0 // This check is valid, but currently it's better to not treat // initialising uses of string-$ as definitely setting the 
string // variable because for some target languages that means we need to // initialise to an empty string at the start of the function and // we would incur overhead from doing so. if (p->left->type == c_stringassign) { // Special-case `$x = S` because it's easy to handle. return SET_BEFORE_ANY_USE; } #endif // Otherwise, for now we assume that `$x C` might use `x` before // setting it. If string-$ sees wider use we can do better here. return USE_BEFORE_SET; case c_backwardmode: case c_define: // We always start from c_define's ->left. case c_booleans: case c_externals: case c_groupings: case c_integers: case c_routines: case c_strings: // Allowing these would allow checking the whole program. assert(0); return UNKNOWN; case c_comment1: case c_comment2: case c_decimal: case c_get: case c_hex: case c_stringdef: case c_stringescapes: // These are only use in the tokeniser. assert(0); break; case c_atlimit: case c_atmark: case c_setmark: // These are only use in the tokeniser and analyser. assert(0); break; case c_as: case c_for: case c_ket: // These shouldn't occur in this context. assert(0); break; } /* Pessimistic assumption for cases we don't handle yet. */ printf("Assuming the worst about '%s' (%d)\n", name_of_token(p->type), p->type); return USE_BEFORE_SET; } static int always_set_before_use(struct node * p, struct node * func, struct name * v) { return always_set_before_use_(p, func, v) != USE_BEFORE_SET; } static void remove_unreachable_routine(struct analyser * a, struct name * q) { struct node ** ptr = &(a->program); while (*ptr) { if ((*ptr)->name == q) { *ptr = (*ptr)->right; } else { ptr = &((*ptr)->right); } } } // Return 0 for always f. // Return 1 for always t. // Return -1 for don't know (or can raise t or f). static int check_possible_signals(struct analyser * a, struct node * p) { switch (p->type) { case c_fail: case c_false: /* Always gives signal f. 
*/ return 0; case c_stringassign: case c_attach: case c_debug: case c_delete: case c_do: case c_insert: case c_leftslice: case c_rightslice: case c_set: case c_slicefrom: case c_sliceto: case c_tolimit: case c_tomark: case c_true: case c_try: case c_unset: case c_assign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_functionend: /* Always gives signal t. */ return 1; case c_repeat: { int possible_signals = p->left->possible_signals; if (possible_signals == 1) { fprintf(stderr, "%s:%d: warning: infinite loop: body of 'repeat' always signals 't'\n", a->tokeniser->file, p->line_number); // Any code after this is unreachable. // FIXME: This only prunes the rest of this command list - we // want to prune anything that's only reachable from here. p->right = NULL; } else if (possible_signals == 0) { fprintf(stderr, "%s:%d: warning: body of 'repeat' always signals 'f'\n", a->tokeniser->file, p->line_number); p->type = c_do; } /* Always gives signal t. */ return 1; } case c_not: { // `not` signals the opposite to the command it is applied to. int res = p->left->possible_signals; if (res < 0) { // `not` applied to command which can signal `t` or `f`. return res; } if (res == 0) { fprintf(stderr, "%s:%d: warning: 'not' applied to command which always signals f\n", a->tokeniser->file, p->line_number); // Handling the failure will restore the cursor, so equivalent to `do`. p->type = c_do; return 1; } fprintf(stderr, "%s:%d: warning: 'not' applied to command which always signals t\n", a->tokeniser->file, p->line_number); // This `not` is equivalent to `fail`. p->type = c_fail; return 0; } case c_setlimit: { /* If either always signals f, setlimit does too. */ int res = p->left->possible_signals; int res2 = p->aux->possible_signals; if (res == 0 || res2 == 0) { return 0; } // If both always signal t, setlimit does too. Otherwise we know at // least one is unknown and that means setlimit's signal is unknown. 
// We can achieve that with a simple bitwise or. return res | res2; } case c_and: case c_bra: { struct node * q = p->left; int r = 1; while (q) { int res = q->possible_signals; if (res == 0) { // If any command always signals f, then the list always // signals f. if (q->right) { if (q->right->type != c_functionend) { fprintf(stderr, "%s:%d: warning: command always signals f here so rest of %s is unreachable\n", a->tokeniser->file, q->line_number, (p->type == c_and ? "'and'" : "command list")); } q->right = NULL; } return res; } if (res < 0) r = res; q = q->right; } return r; } case c_backwards: case c_dollar: case c_loop: case c_reverse: case c_test: /* Give same signal as p->left. */ return p->left->possible_signals; case c_atleast: { int possible_signals = p->left->possible_signals; if (possible_signals == 1) { fprintf(stderr, "%s:%d: warning: infinite loop: body of 'atleast' always signals 't'\n", a->tokeniser->file, p->line_number); // Any code after this is unreachable. // FIXME: This only prunes the rest of this command list - we // want to prune anything that's only reachable from here. p->right = NULL; } else if (possible_signals == 0) { fprintf(stderr, "%s:%d: warning: body of 'atleast' always signals 'f'\n", a->tokeniser->file, p->line_number); p->type = c_bra; p->AE = NULL; } /* Give same signal as p->left. */ return possible_signals; } case c_call: // If the call recurses back into the current routine then this // will still be -1. return p->name->definition->possible_signals; case c_gopast: case c_goto: case c_goto_grouping: case c_gopast_grouping: case c_goto_non: case c_gopast_non: /* FIXME: unless we can prove that c is either definitely atlimit * or definitely not atlimit... */ return -1; case c_booltest: case c_not_booltest: case c_hop: case c_literalstring: case c_next: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: case c_grouping: case c_non: case c_name: /* FIXME: unless we can prove... 
*/ return -1; case c_substring: { struct among * x = p->among; if (x->always_matches) { return 1; } return -1; } case c_among: { struct among * x = p->among; int r = 1; if (x->substring == NULL) { if (!x->always_matches) { r = -1; } } if (x->command_count > 0) { bool trues = (x->nocommand_count > 0); bool falses = false; for (int i = 1; i <= x->command_count; i++) { int res = x->commands[i - 1]->possible_signals; if (res == 0) { falses = true; } else if (res > 0) { trues = true; } else { falses = trues = true; } if (falses && trues) break; } if (!trues) { // All commands in among always fail. return 0; } if (falses) { // Commands in among can succeed or fail. return -1; } } return r; } case c_or: { int r = 0; for (struct node * q = p->left; q; q = q->right) { // Just check this node - q->right is a separate clause of // the OR. int res = q->possible_signals; if (res > 0) { // If any clause of the OR always signals t, then the OR // always signals t. if (q->right) { if (q->right->type != c_functionend) { fprintf(stderr, "%s:%d: warning: command always signals t here so rest of 'or' is unreachable\n", a->tokeniser->file, q->line_number); } q->right = NULL; } return 1; } if (res < 0) { r = res; } } return r; } default: return -1; } } static void visit_routine(struct analyser * a, struct name * n); static void visit_node(struct analyser * a, struct node * p) { while (p) { if (p->name) { if (p->type == c_call) { visit_routine(a, p->name); } else { // Mark as reachable. 
p->name->count = -2; } } else if (p->type == c_among) { struct among * x = p->among; x->used = true; for (int i = 0; i < x->literalstring_count; ++i) { if (x->b[i].function) visit_routine(a, x->b[i].function); } for (int i = 0; i < x->command_count; ++i) { visit_node(a, x->commands[i]); } } if (p->left) { visit_node(a, p->left); } if (p->aux) { visit_node(a, p->aux); } if (p->AE) { visit_node(a, p->AE); } p->possible_signals = check_possible_signals(a, p); if ((p->type == c_and || p->type == c_or) && !p->left->right) { // Pruning of unreachable code can leave single-entry c_and and // c_or nodes. These can lead to unused variables in the generated // code and may also hinder further optimisations. // // We want to replace these with their subnode. It's fiddly to do // an actual replacement as we'd need to update the location we got // the current value of `p` from, so instead we swap the contents // of the two nodes. Note that we must not swap the existing ->next // pointers as that could break the chain of all allocated nodes we // have from analyser->nodes. struct node * p_left = p->left; struct node * p_right = p->right; struct node * p_next = p->next; struct node tmp = *p; tmp.next = p_left->next; *p = *p_left; p->right = p_right; p->next = p_next; *p_left = tmp; if (p->type == c_among) { // Update struct among's node pointer. for (struct among * q = a->amongs; q; q = q->next) { if (q->node == p_left) { q->node = p; break; } } } } p = p->right; } } static void visit_routine(struct analyser * a, struct name * n) { if (n->count == -2) { // Already visited. We set n->count before walking the definition so // this also prevents the walk from reentering a routine via recursive // calls. 
return; } n->count = -2; struct node * p = n->definition; // Recursive functions are valid in the Snowball language, but aren't // actually used in typical snowball programs so we take a simple // approach and handle them by setting pessimistic assumptions here which // will be used if a function calls itself (directly or indirectly). p->possible_signals = -1; // Assume it could signal t or f. visit_node(a, p->left); // Update with calculated value. p->possible_signals = p->left->possible_signals; } extern void read_program(struct analyser * a, unsigned localise_mask) { read_program_(a, -1); for (struct name * q = a->names; q; q = q->next) { // Declaring but not defining is only an error if used. We'll issue // a warning later on if there are no errors. if (!q->used) continue; bool error = false; switch (q->type) { case t_external: case t_routine: error = (q->definition == NULL); break; case t_grouping: error = (q->grouping == NULL); break; } if (error) { count_error(a); fprintf(stderr, "%s:%d: %s '%.*s' declared but not defined\n", a->tokeniser->file, q->used->line_number, name_of_type(q->type), SIZE(q->s), q->s); } } // Skip name warning checks if there are errors. if (a->tokeniser->error_count) return; for (struct name * n = a->names; n; n = n->next) { if (n->type == t_external) { if (!n->used) { // Externals can be called from outside of Snowball, so if they // aren't already marked as used we set the `used` field to // point to the definition so we can just check this field // later. 
n->used = n->definition; } visit_routine(a, n); } } for (struct name * q = a->names; q; q = q->next) { if (q->references == 0) { fprintf(stderr, "%s:%d: warning: %s '%.*s' ", a->tokeniser->file, q->declaration_line_number, name_of_type(q->type), SIZE(q->s), q->s); if (q->type == t_routine || q->type == t_external || q->type == t_grouping) { fprintf(stderr, "declared but not defined\n"); } else { fprintf(stderr, "declared but not used\n"); } q->used = NULL; continue; } if (q->type == t_routine || q->type == t_grouping) { /* It's OK to define a grouping but only use it to define other * groupings. */ if (!q->used && !q->used_in_definition) { int line_num; if (q->type == t_routine) { line_num = q->definition->line_number; } else { line_num = q->grouping->line_number; } fprintf(stderr, "%s:%d: warning: %s '%.*s' defined but not used\n", a->tokeniser->file, line_num, name_of_type(q->type), SIZE(q->s), q->s); continue; } } if (q->type < t_routine) { if (!q->initialised) { fprintf(stderr, "%s:%d: warning: %s '%.*s' is never initialised\n", a->tokeniser->file, q->declaration_line_number, name_of_type(q->type), SIZE(q->s), q->s); } else if (!q->value_used) { fprintf(stderr, "%s:%d: warning: %s '%.*s' is set but never used\n", a->tokeniser->file, q->declaration_line_number, name_of_type(q->type), SIZE(q->s), q->s); remove_dead_assignments(a->program, q); q->used = NULL; continue; } } if (q->count == -1) { // Used but use is not reachable by calling any externals so // suppress all code generation for this name. // // We only issue a warning about unreachability for routines here // to avoid excess diagnostics, since other types must be used in a // routine which is not reachable (or will have been warned about as // unused by the check above). 
if (q->type == t_routine) { fprintf(stderr, "%s:%d: warning: %s '%.*s' not reachable from any externals\n", a->tokeniser->file, q->declaration_line_number, name_of_type(q->type), SIZE(q->s), q->s); remove_unreachable_routine(a, q); } q->used = NULL; } } /* We've now identified variables whose values are never used and * names which are unreachable, and cleared "used" for them, so go * through and unlink the unused ones. */ struct name * n = a->names; struct name ** n_ptr = &(a->names); while (n) { if (!n->used) { if (n->grouping) { // Clear the name field then loop through and remove from // the groupings list just below. n->grouping->name = NULL; } else if (n->definition) { remove_unreachable_routine(a, n); } struct name * n_next = n->next; lose_s(n->s); FREE(n); n = n_next; *n_ptr = n; continue; } n_ptr = &(n->next); n = n->next; } // Remove groupings which aren't used. struct grouping * g = a->groupings; struct grouping ** g_ptr = &(a->groupings); while (g) { if (!g->name) { struct grouping * g_next = g->next; lose_b(g->b); FREE(g); g = g_next; *g_ptr = g; continue; } g_ptr = &(g->next); g = g->next; } // Remove amongs which are in unreachable routines from the list // and number the others. { int among_count = 0; struct among ** a_ptr = &(a->amongs); while (*a_ptr) { struct among * x = *a_ptr; if (!x->used) { *a_ptr = x->next; continue; } x->number = among_count++; if (x->function_count > 0) ++a->among_with_function_count; for (int i = 1; i <= x->command_count; i++) { int merge_with = 0; struct node * command = x->commands[i - 1]; assert(command->type == c_bra); if (!command->left || is_just_true(command->left)) { // Optimisation has turned this action into a no-op. command->left = NULL; merge_with = -1; } else { for (int k = 1; k < i; ++k) { if (nodes_equivalent(command->left, x->commands[k - 1]->left)) { // Optimisation has made this action equivalent // to an earlier one. 
merge_with = k; break; } } } if (!merge_with) continue; // Update references to this command index to be `merge_with` // and subtract one from references to command indexes after // this one. for (int j = 0; j < x->literalstring_count; ++j) { int diff = (x->b[j].result - i); if (diff == 0) { x->b[j].result = merge_with; if (merge_with == 0) { assert(x->b[j].action->type == c_bra); x->b[j].action->left = NULL; } } else if (diff > 0) { --x->b[j].result; } } memmove(x->commands + (i - 1), x->commands + i, sizeof(x->commands[0]) * (x->command_count - i)); --x->command_count; ++x->nocommand_count; --i; } if (x->command_count > 1 || (x->command_count == 1 && x->nocommand_count > 0)) { /* We need to set among_var rather than just checking if * find_among*() returns zero or not. */ x->amongvar_needed = true; if (x->in_routine) x->in_routine->amongvar_needed = true; } a_ptr = &(x->next); } } /* Localise variables. * * We localise variables which are only referenced in a single function * (routine or external) and which are always set before being read within * that function (since a function could rely on a variable's previous * value surviving). * * We could potentially localise variables referenced in multiple functions * provided that they are always set before use in every function they are * referenced in, and that these functions don't call one another, but that * situation doesn't occur in any of the stemmers we currently ship. 
*/ memset(a->name_count, 0, sizeof(a->name_count)); for (struct name * name = a->names; name; name = name->next) { if (name->local_to != NULL) { if (localise_mask & (1 << name->type)) { struct node * func = name->local_to->definition; if (!always_set_before_use(func->left, func, name)) { fprintf(stderr, "%s:%d: info: Could not localise %s `%.*s` to routine `%.*s`\n", a->tokeniser->file, func->line_number, name_of_type(name->type), SIZE(name->s), name->s, SIZE(func->name->s), func->name->s); report_s(stderr, name->s); fprintf(stderr, "\n"); name->local_to = NULL; } } else { name->local_to = NULL; } } if (name->local_to == NULL) { name->count = a->name_count[name->type]++; } } a->variable_count = a->name_count[t_string] + a->name_count[t_boolean] + a->name_count[t_integer]; // Now number the locals (which e.g. Ada and Pascal use to avoid clashes // from case-insensitive variable names). We use a copy of the counters // to do this so that a->name_count[] reflects the number of non-localised // variables of each type. 
int name_count[t_size]; memcpy(name_count, a->name_count, sizeof(name_count)); for (struct name * name = a->names; name; name = name->next) { if (name->count < 0) { name->count = name_count[name->type]++; } } } extern struct analyser * create_analyser(struct tokeniser * t) { NEW(analyser, a); *a = (struct analyser){0}; a->tokeniser = t; a->mode = m_forward; a->modifyable = true; return a; } extern void close_analyser(struct analyser * a) { { struct node * q = a->nodes; while (q) { struct node * q_next = q->next; FREE(q); q = q_next; } } { struct name * q = a->names; while (q) { struct name * q_next = q->next; lose_s(q->s); FREE(q); q = q_next; } } { struct literalstring * q = a->literalstrings; while (q) { struct literalstring * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } { struct among * q = a->amongs; while (q) { struct among * q_next = q->next; FREE(q->b); FREE(q->commands); FREE(q); q = q_next; } } { struct grouping * q = a->groupings; while (q) { struct grouping * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } FREE(a); } snowball-3.1.0/compiler/driver.c000066400000000000000000000657331520373054300166250ustar00rootroot00000000000000#include /* for toupper etc */ #include /* for fprintf etc */ #include /* for free etc */ #include /* for strcmp */ #include "header.h" #define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext" #define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram" #define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among" #define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder" #define DEFAULT_DART_BASE_CLASS "SnowballProgram" #define DEFAULT_GO_PACKAGE "snowball" #define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go" #define DEFAULT_ADA_PACKAGE "Snowball" #define DEFAULT_ADA_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/ada" #define DEFAULT_CS_NAMESPACE "Snowball" #define DEFAULT_CS_BASE_CLASS "Stemmer" #define DEFAULT_CS_AMONG_CLASS "Among" #define DEFAULT_CS_STRING_CLASS 
"StringBuilder" #define DEFAULT_CPLUSPLUS_NAMESPACE "Snowball" #define DEFAULT_CPLUSPLUS_BASE_CLASS "Stemmer" #define DEFAULT_JS_BASE_CLASS "B" #define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer" static int eq(const char * s1, const char * s2) { return strcmp(s1, s2) == 0; } static void print_arglist(int exit_code) { FILE * f = exit_code ? stderr : stdout; fprintf(f, "Usage: snowball SOURCE_FILE... [OPTIONS]\n\n" "Supported options:\n" " -o, -output OUTPUT_BASE\n" " -s, -syntax show syntax tree and stop\n" " -comments generate comments\n" " -coverage generate coverage report\n" " -ada generate Ada\n" " -c++ generate C++\n" " -cs, -csharp generate C#\n" " -dart generate Dart\n" " -go generate Go\n" " -j, -java generate Java\n" " -js generate Javascript\n" " -pascal generate Pascal\n" " -php generate PHP\n" " -py, -python generate Python\n" " -rust generate Rust\n" " -zig generate Zig\n" " -w, -widechars\n" " -u, -utf8\n" " -n, -name CLASS_NAME\n" " -ep, -eprefix EXTERNAL_PREFIX\n" " -vp, -vprefix VARIABLE_PREFIX\n" " -i, -include DIRECTORY\n" " -r, -runtime DIRECTORY\n" " -cheader header name to include from C/C++ file\n" " -hheader header name to include from C/C++ header\n" " -p, -parentclassname CLASS_NAME fully qualified parent class name\n" " -P, -Package PACKAGE_NAME package name for stemmers\n" " -S, -Stringclass STRING_CLASS StringBuffer-compatible class\n" " -a, -amongclass AMONG_CLASS fully qualified name of the Among class\n" " -gor, -goruntime PACKAGE_NAME Go snowball runtime package\n" " --help display this help and exit\n" " --version output version information and exit\n" ); exit(exit_code); } static void check_lim(int i, int argc) { if (i >= argc) { fprintf(stderr, "argument list is one short\n"); print_arglist(1); } } static FILE * get_output(byte * s) { s[SIZE(s)] = 0; const char * filename = (const char *)s; FILE * output = fopen(filename, "w"); if (output == NULL) { fprintf(stderr, "Can't open output %s\n", filename); exit(1); } return output; } 
static struct options * read_options(int * argc_ptr, char * argv[]) { int argc = *argc_ptr; int i = 1; int new_argc = 1; /* Note down the last option used to specify an explicit encoding so * we can warn we ignored it for languages with a fixed encoding. */ const char * encoding_opt = NULL; NEW(options, o); *o = (struct options){0}; // Set defaults which differ from empty initialisation. o->target_lang = LANG_C; o->encoding = ENC_SINGLEBYTE; /* read options: */ while (i < argc) { char * s = argv[i++]; if (s[0] != '-' || s[1] == '\0') { /* Non-option argument - shuffle down. */ argv[new_argc++] = s; continue; } { if (eq(s, "-o") || eq(s, "-output")) { check_lim(i, argc); o->output_file = create_s_from_sz(argv[i++]); continue; } if (eq(s, "-n") || eq(s, "-name")) { check_lim(i, argc); o->name = create_s_from_sz(argv[i++]); continue; } if (eq(s, "-js")) { o->target_lang = LANG_JAVASCRIPT; continue; } if (eq(s, "-php")) { o->target_lang = LANG_PHP; continue; } if (eq(s, "-rust")) { o->target_lang = LANG_RUST; continue; } if (eq(s, "-zig")) { o->target_lang = LANG_ZIG; continue; } if (eq(s, "-go")) { o->target_lang = LANG_GO; continue; } if (eq(s, "-j") || eq(s, "-java")) { o->target_lang = LANG_JAVA; continue; } if (eq(s, "-dart")) { o->target_lang = LANG_DART; continue; } if (eq(s, "-cs") || eq(s, "-csharp")) { o->target_lang = LANG_CSHARP; continue; } if (eq(s, "-c++")) { o->target_lang = LANG_CPLUSPLUS; continue; } if (eq(s, "-pascal")) { o->target_lang = LANG_PASCAL; continue; } if (eq(s, "-py") || eq(s, "-python")) { o->target_lang = LANG_PYTHON; continue; } if (eq(s, "-ada")) { o->target_lang = LANG_ADA; continue; } if (eq(s, "-w") || eq(s, "-widechars")) { encoding_opt = s; o->encoding = ENC_WIDECHARS; continue; } if (eq(s, "-s") || eq(s, "-syntax")) { o->syntax_tree = true; continue; } if (eq(s, "-comments")) { o->comments = true; continue; } if (eq(s, "-coverage")) { o->coverage = true; continue; } if (eq(s, "-ep") || eq(s, "-eprefix")) { check_lim(i, argc); 
o->externals_prefix = argv[i++]; continue; } if (eq(s, "-vp") || eq(s, "-vprefix")) { check_lim(i, argc); o->variables_prefix = argv[i++]; continue; } if (eq(s, "-cheader")) { check_lim(i, argc); o->cheader = argv[i++]; continue; } if (eq(s, "-hheader")) { check_lim(i, argc); o->hheader = argv[i++]; continue; } if (eq(s, "-i") || eq(s, "-include")) { check_lim(i, argc); { NEW(include, p); *p = (struct include){0}; byte * include_dir = add_sz_to_s(NULL, argv[i++]); include_dir = add_char_to_s(include_dir, '/'); p->s = include_dir; if (o->includes == NULL) { o->includes = p; } else { o->includes_end->next = p; } o->includes_end = p; } continue; } if (eq(s, "-r") || eq(s, "-runtime")) { check_lim(i, argc); o->runtime_path = argv[i++]; continue; } if (eq(s, "-u") || eq(s, "-utf8")) { encoding_opt = s; o->encoding = ENC_UTF8; continue; } if (eq(s, "-p") || eq(s, "-parentclassname")) { check_lim(i, argc); o->parent_class_name = argv[i++]; continue; } if (eq(s, "-P") || eq(s, "-Package")) { check_lim(i, argc); o->package = argv[i++]; continue; } if (eq(s, "-S") || eq(s, "-stringclass")) { check_lim(i, argc); o->string_class = argv[i++]; continue; } if (eq(s, "-a") || eq(s, "-amongclass")) { check_lim(i, argc); o->among_class = argv[i++]; continue; } if (eq(s, "-gor") || eq(s, "-goruntime")) { check_lim(i, argc); o->go_snowball_runtime = argv[i++]; continue; } if (eq(s, "--help")) { print_arglist(0); } if (eq(s, "--version")) { printf("Snowball compiler version " SNOWBALL_VERSION "\n"); exit(0); } fprintf(stderr, "'%s' misplaced\n", s); print_arglist(1); } } if (new_argc == 1) { fprintf(stderr, "no source files specified\n"); print_arglist(1); } argv[new_argc] = NULL; /* Set language-dependent defaults. 
*/ switch (o->target_lang) { case LANG_C: encoding_opt = NULL; break; case LANG_CPLUSPLUS: encoding_opt = NULL; if (!o->parent_class_name) o->parent_class_name = DEFAULT_CPLUSPLUS_BASE_CLASS; if (!o->package) o->package = DEFAULT_CPLUSPLUS_NAMESPACE; break; case LANG_CSHARP: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_CS_BASE_CLASS; if (!o->string_class) o->string_class = DEFAULT_CS_STRING_CLASS; if (!o->among_class) o->among_class = DEFAULT_CS_AMONG_CLASS; if (!o->package) o->package = DEFAULT_CS_NAMESPACE; break; case LANG_GO: o->encoding = ENC_UTF8; if (!o->package) o->package = DEFAULT_GO_PACKAGE; if (!o->go_snowball_runtime) o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; break; case LANG_ADA: o->encoding = ENC_UTF8; if (!o->package) o->package = DEFAULT_ADA_PACKAGE; break; case LANG_JAVA: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_JAVA_BASE_CLASS; if (!o->string_class) o->string_class = DEFAULT_JAVA_STRING_CLASS; if (!o->among_class) o->among_class = DEFAULT_JAVA_AMONG_CLASS; if (!o->package) o->package = DEFAULT_JAVA_PACKAGE; break; case LANG_DART: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_DART_BASE_CLASS; break; case LANG_JAVASCRIPT: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_JS_BASE_CLASS; break; case LANG_PHP: o->encoding = ENC_UTF8; break; case LANG_PYTHON: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS; break; case LANG_RUST: o->encoding = ENC_UTF8; break; case LANG_ZIG: o->encoding = ENC_UTF8; break; default: break; } if (encoding_opt) { fprintf(stderr, "warning: %s only meaningful for C and C++\n", encoding_opt); } if (o->target_lang != LANG_C && o->target_lang != LANG_CPLUSPLUS) { if (o->runtime_path) { fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n"); } if (o->variables_prefix) { 
fprintf(stderr, "warning: -vp/-vprefix only meaningful for C and C++\n"); } if (o->coverage) { fprintf(stderr, "warning: -coverage only currently supported for C and C++\n"); } } // Split any extension off o->output_file and set o->output_leaf to just // its leafname (which e.g. is used to generate `#include "english.h"` in // path/to/english.c). if (!o->output_file) { // Default output uses the basename from the first Snowball source. // E.g. algorithms/english.sbl -> english const char * first_source = argv[1]; const char * slash = strrchr(first_source, '/'); const char * leaf = (slash == NULL) ? first_source : slash + 1; slash = strrchr(leaf, '\\'); if (slash != NULL) leaf = slash + 1; const char * dot = strrchr(leaf, '.'); if (dot) { o->output_file = create_s_from_data(leaf, dot - leaf); } else { o->output_file = create_s_from_sz(leaf); } o->output_leaf = copy_s(o->output_file); } else { // Remove any extension from o->output_file so `-o path/to/english.c` // works. o->output_file[SIZE(o->output_file)] = '\0'; const char * output_file = (const char *)o->output_file; const char * slash = strrchr(output_file, '/'); const char * leaf = (slash == NULL) ? output_file : slash + 1; slash = strrchr(leaf, '\\'); if (slash != NULL) leaf = slash + 1; const char * dot = strrchr(leaf, '.'); if (dot) { o->extension = create_s_from_sz(dot); SET_SIZE(o->output_file, dot - output_file); o->output_leaf = create_s_from_data(leaf, dot - leaf); } else { o->output_leaf = create_s_from_sz(leaf); } } if (!o->name) { o->name = copy_s(o->output_leaf); const byte * dot = memchr(o->name, '.', SIZE(o->name)); if (dot) { // Trim off any extension (we only remove the last of multiple // extensions above). SET_SIZE(o->name, dot - o->name); } switch (o->target_lang) { case LANG_CSHARP: case LANG_PASCAL: /* Upper case initial letter. 
*/ o->name[0] = toupper(o->name[0]); break; case LANG_CPLUSPLUS: case LANG_PHP: case LANG_PYTHON: { /* Upper case initial letter and change each * underscore+letter or hyphen+letter to an upper case * letter. */ size_t len = SIZE(o->name); size_t new_len = 0; bool uc_next = true; for (size_t j = 0; j != len; ++j) { byte ch = o->name[j]; if (ch == '_' || ch == '-') { uc_next = true; } else { if (uc_next) { o->name[new_len] = toupper(ch); uc_next = false; } else { o->name[new_len] = ch; } ++new_len; } } SET_SIZE(o->name, new_len); break; } default: /* Just use as-is, e.g. that's the Java convention. */ break; } } *argc_ptr = new_argc; return o; } extern int main(int argc, char * argv[]) { struct options * o = read_options(&argc, argv); char * file = argv[1]; byte * u = get_input(file); if (u == NULL) { fprintf(stderr, "Can't open input %s\n", file); exit(1); } struct tokeniser * t = create_tokeniser(u, file); struct analyser * a = create_analyser(t); struct input ** next_input_ptr = &(t->next); unsigned localise_mask = 0; a->encoding = t->encoding = o->encoding; t->includes = o->includes; /* If multiple source files are specified, set up the others to be * read after the first in order, using the same mechanism as * 'get' uses. */ for (int i = 2; i != argc; ++i) { NEW(input, q); *q = (struct input){0}; file = argv[i]; u = get_input(file); if (u == NULL) { fprintf(stderr, "Can't open input %s\n", file); exit(1); } q->p = u; q->file = file; q->line_number = 1; *next_input_ptr = q; next_input_ptr = &(q->next); } *next_input_ptr = NULL; /* Whether it's helpful to try to localise string variables varies * greatly between target languages. One reason for this is likely * to be that strings are immutable in some languages (e.g. Dart, * Javascript, Python) so each string operation creates a new * string anyway. * * We've attempted to benchmark most languages to decide. 
* * One potential gotcha here is for garbage collected languages, * where our benchmark might not trigger GC and in that case our * timing is missing the cost of that, which any long running * indexing process will eventually incur. * * We've mostly used the following artificial benchmark which * exercises a local string variable to test this: * * strings ( s ) * routines ( r ) * externals ( stem ) * define r as (-> s s) * define stem as ( next [tolimit] loop 100000000 do r ) * * Replace e.g. english.sbl with this and build the stemwords * equivalent for the target language, then: * * $ echo nonalphabetisations|time ./stemwords * * The appropriate number of iterations to use varies, and is * annotated below. */ switch (o->target_lang) { case LANG_ADA: // 1000000000: local 13.7s vs global 5.2s case LANG_C: // We lack a way to generate lose_s(v) on every `return` // from the function, but manually adjusting the generated // code to do this gives: // // 1000000000: local 44.9s vs global 6.3s case LANG_CPLUSPLUS: // String variables are handled the same as LANG_C. case LANG_CSHARP: // 100000000: local 18.8s vs global 12.4s case LANG_JAVA: // 1000000000: local 10.1s vs global 7.1s case LANG_RUST: // 1000000000: localising was slightly slower. case LANG_ZIG: // 10000000: localising strings was slightly slower. localise_mask = (1 << t_boolean) | (1 << t_integer); break; case LANG_DART: // Not timed, but strings are immutable so seems likely // to be helpful to localise. case LANG_GO: // 1000000000: localising was about 10% faster. case LANG_JAVASCRIPT: // 10000000: Slightly faster. case LANG_PASCAL: // Slightly faster. case LANG_PHP: // Slightly faster. case LANG_PYTHON: // 10000000: local 7.6s vs global 7.9s. Microbenchmarking // with timeit alligns with this. 
localise_mask = (1 << t_boolean) | (1 << t_integer) | (1 << t_string); break; } read_program(a, localise_mask); if (t->error_count > 0) exit(1); if (o->syntax_tree) print_program(a); if (!o->syntax_tree) { struct generator * g = create_generator(a, o); switch (o->target_lang) { case LANG_C: case LANG_CPLUSPLUS: { byte * s = copy_s(o->output_file); s = add_literal_to_s(s, ".h"); o->output_h = get_output(s); SET_SIZE(s, SIZE(o->output_file)); if (o->extension && !(SIZE(o->extension) == 2 && memcmp(o->extension, ".h", 2) == 0)) { s = add_s_to_s(s, o->extension); } else if (o->target_lang == LANG_CPLUSPLUS) { s = add_literal_to_s(s, ".cc"); } else { s = add_literal_to_s(s, ".c"); } o->output_src = get_output(s); lose_s(s); generate_program_c(g); fclose(o->output_src); fclose(o->output_h); break; } #ifndef TARGET_C_ONLY case LANG_ADA: { byte * s = copy_s(o->output_file); s = add_literal_to_s(s, ".ads"); o->output_h = get_output(s); SET_SIZE(s, SIZE(o->output_file)); if (o->extension && !(SIZE(o->extension) == 4 && memcmp(o->extension, ".ads", 2) == 0)) { s = add_s_to_s(s, o->extension); s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".adb"); } o->output_src = get_output(s); lose_s(s); generate_program_ada(g); fclose(o->output_src); fclose(o->output_h); break; } case LANG_CSHARP: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".cs"); } o->output_src = get_output(s); lose_s(s); generate_program_csharp(g); fclose(o->output_src); break; } case LANG_DART: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".dart"); } o->output_src = get_output(s); lose_s(s); generate_program_dart(g); fclose(o->output_src); break; } case LANG_GO: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".go"); } o->output_src = get_output(s); lose_s(s); 
generate_program_go(g); fclose(o->output_src); break; } case LANG_JAVA: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".java"); } o->output_src = get_output(s); lose_s(s); generate_program_java(g); fclose(o->output_src); break; } case LANG_JAVASCRIPT: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".js"); } o->output_src = get_output(s); lose_s(s); generate_program_js(g); fclose(o->output_src); break; } case LANG_PASCAL: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".pas"); } o->output_src = get_output(s); lose_s(s); generate_program_pascal(g); fclose(o->output_src); break; } case LANG_PHP: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".php"); } o->output_src = get_output(s); lose_s(s); generate_program_php(g); fclose(o->output_src); break; } case LANG_PYTHON: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".py"); } o->output_src = get_output(s); lose_s(s); generate_program_python(g); fclose(o->output_src); break; } case LANG_RUST: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".rs"); } o->output_src = get_output(s); lose_s(s); generate_program_rust(g); fclose(o->output_src); break; } case LANG_ZIG: { byte * s = copy_s(o->output_file); if (o->extension) { s = add_s_to_s(s, o->extension); } else { s = add_literal_to_s(s, ".zig"); } o->output_src = get_output(s); lose_s(s); generate_program_zig(g); fclose(o->output_src); break; } #else default: fprintf(stderr, "Support for requested target language not enabled\n"); exit(1); #endif } close_generator(g); } close_tokeniser(t); close_analyser(a); lose_s(u); struct 
include * p = o->includes; while (p) { struct include * q = p->next; lose_s(p->s); FREE(p); p = q; } lose_s(o->extension); lose_s(o->name); lose_s(o->output_file); lose_s(o->output_leaf); FREE(o); if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); return 0; } snowball-3.1.0/compiler/generator.c000066400000000000000000000444221520373054300173100ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for free etc */ #include /* for strlen */ #include "header.h" /* Generator functions common to multiple target languages. */ extern struct generator * create_generator(struct analyser * a, struct options * o) { NEW(generator, g); *g = (struct generator){0}; g->analyser = a; g->options = o; g->failure_label = -1; g->varname_prefix = "v_"; g->margin_indent = " "; return g; } extern void close_generator(struct generator * g) { FREE(g); } extern int just_return_on_fail(struct generator * g) { return g->failure_label == x_return && str_len(g->failure_str) == 0; } extern int tailcallable(struct generator * g, struct node * p) { return just_return_on_fail(g) && p->right && p->right->type == c_functionend; } // Write a C-style relational operator (also used by some other languages). extern void write_c_relop(struct generator * g, int relop) { switch (relop) { case c_eq: write_string(g, " == "); break; case c_ne: write_string(g, " != "); break; case c_gt: write_string(g, " > "); break; case c_ge: write_string(g, " >= "); break; case c_lt: write_string(g, " < "); break; case c_le: write_string(g, " <= "); break; default: fprintf(stderr, "Unexpected type #%d in write_c_relop\n", relop); exit(1); } } static void write_comment_literalstring(struct generator * g, const symbol *s, const char * end) { if (end) { // Check if the literal string contains the target language end comment // string. Don't try to be clever here as real-world literal strings // are unlikely to contain even partial matches. 
int end_len = strlen(end); if (end_len <= SIZE(s)) { for (int i = 0; i <= SIZE(s) - end_len; ++i) { for (int j = 0; j < end_len; ++j) { if (s[i + j] != end[j]) goto next_outer; } write_string(g, ""); return; next_outer: ; } } } symbol ch_max = 0xa0; if (g->options->encoding == ENC_SINGLEBYTE) { ch_max = 0xff; } int i = 0; write_char(g, '\''); while (i < SIZE(s)) { int ch; if (g->options->encoding == ENC_UTF8) { i += get_utf8(s + i, &ch); } else { ch = s[i++]; } if (ch == '\'' || ch == '{') { write_char(g, '{'); write_char(g, ch); write_char(g, '}'); } else if (ch < 32 || (ch >= 127 && ch <= ch_max) || ch == '\\' || ch >= 0x590) { // Encode characters which are problematic if emitted literally // using Snowball-style `{U+xx}`: // // * Control characters. // // * For ENC_SINGLEBYTE we encode all non-ASCII to avoid invalid // UTF-8 in comments (which clang warns about for C/C++ with // option `-pedantic` or `-Winvalid-utf8`). // // * `\`: In Java, `\u000a` in a comment is interpreted as a // newline and so exits the comment, while `\uq` gives // compilation error `illegal unicode escape`. Since `\` is // unusual in Snowball literal strings we take the simple // approach of escaping it for all target languages. // // * Anything >= 0x590 as a crude way to avoid LTR characters // affecting the rendering of source character order in confusing // ways. 
write_string(g, "{U+"); write_hex(g, ch); write_char(g, '}'); } else { write_wchar_as_utf8(g, ch); } } write_char(g, '\''); } static void write_comment_AE(struct generator * g, struct node * p) { switch (p->type) { case c_name: write_s(g, p->name->s); break; case c_number: write_int(g, p->number); break; case c_cursor: case c_len: case c_lenof: case c_limit: case c_maxint: case c_minint: case c_size: case c_sizeof: write_string(g, name_of_token(p->type)); if (p->name) { write_char(g, ' '); write_s(g, p->name->s); } break; case c_neg: write_char(g, '-'); write_comment_AE(g, p->right); break; case c_multiply: case c_plus: case c_minus: case c_divide: write_char(g, '('); write_comment_AE(g, p->left); write_char(g, ' '); write_string(g, name_of_token(p->type)); write_char(g, ' '); write_comment_AE(g, p->right); write_char(g, ')'); break; default: fprintf(stderr, "Unexpected type #%d in write_comment_AE\n", p->type); exit(1); } } void write_comment_content(struct generator * g, struct node * p, const char * end) { switch (p->type) { case c_assign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: if (p->name) { write_char(g, '$'); write_s(g, p->name->s); write_char(g, ' '); } write_string(g, name_of_token(p->type)); write_char(g, ' '); write_comment_AE(g, p->AE); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: write_string(g, "$("); write_comment_AE(g, p->left); write_char(g, ' '); write_string(g, name_of_token(p->type)); write_char(g, ' '); write_comment_AE(g, p->AE); write_char(g, ')'); break; case c_define: if (p->mode == m_forward) { write_string(g, "forwardmode define "); } else { write_string(g, "backwardmode define "); } write_s(g, p->name->s); break; case c_literalstring: write_comment_literalstring(g, p->literalstring, end); break; case c_call: case c_grouping: case c_name: write_s(g, p->name->s); break; default: write_string(g, name_of_token(p->type)); if (p->name) { write_char(g, ' '); write_s(g, 
p->name->s); } else if (p->literalstring) { write_char(g, ' '); write_comment_literalstring(g, p->literalstring, end); } } write_string(g, ", line "); write_int(g, p->line_number); } void write_generated_comment_content(struct generator * g) { // Report only the leafname of the Snowball source file to make output // reproducible even if an absolute path to the source file is specified. write_string(g, "Generated from "); const char * leaf = g->analyser->tokeniser->file; const char * p = strrchr(leaf, '/'); if (p) leaf = p + 1; p = strrchr(leaf, '\\'); if (p) leaf = p + 1; write_string(g, leaf); write_string(g, " by Snowball " SNOWBALL_VERSION " - https://snowballstem.org/"); } void write_start_comment(struct generator * g, const char * comment_start, const char * comment_end) { write_margin(g); write_string(g, comment_start); write_generated_comment_content(g); if (comment_end) { write_string(g, comment_end); } write_newline(g); write_newline(g); } extern struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, g->varname_prefix); str_append_int(output, g->var_number); return output; } extern void write_margin(struct generator * g) { for (int i = 0; i < g->margin; i++) write_string(g, g->margin_indent); } /* K_needed() tests to see if we really need to keep c. Not true when the command does not touch the cursor (and in backwardmode, also does not change the limit by inserting, deleting, or replacing text in the string). This and repeat_score() could be elaborated almost indefinitely. 
*/ static int K_needed_(struct node * p, int call_depth); static int K_needed_node(struct node * p, int call_depth) { switch (p->type) { case c_assignto: case c_do: case c_dollar: case c_leftslice: case c_rightslice: case c_assign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: case c_sliceto: case c_booltest: case c_not_booltest: case c_set: case c_unset: case c_true: case c_false: case c_debug: case c_functionend: // Doesn't change the cursor or always restores it. break; case c_stringassign: // Doesn't change the cursor in forwards mode; in backwards // mode the cursor and forwards limit move in step. break; case c_attach: // Cursor modified in backwardmode. if (p->mode == m_backward) return true; break; case c_insert: // Cursor modified in forwards mode. if (p->mode == m_forward) return true; break; case c_call: /* Recursive functions aren't typical in snowball programs, so * make the pessimistic assumption that keep is needed if we * hit a generous limit on recursion. It's not likely to make * a difference to any real world program, but means we won't * recurse until we run out of stack for pathological cases. */ if (call_depth >= 100) return true; if (K_needed_(p->name->definition->left, call_depth + 1)) return true; break; case c_bra: case c_loop: case c_fail: if (K_needed_(p->left, call_depth)) return true; break; case c_backwards: case c_reverse: case c_test: if (p->possible_signals != 1) return true; // Restores cursor on t and the subcommand can't fail. 
break; default: return true; } return false; } static int K_needed_(struct node * p, int call_depth) { while (p) { if (K_needed_node(p, call_depth)) return true; p = p->right; } return false; } extern int K_needed(struct node * p) { return K_needed_(p, 0); } static int K_needed_node_on_f_(struct node * p, int call_depth) { switch (p->type) { case c_assignto: case c_do: case c_dollar: case c_leftslice: case c_rightslice: case c_assign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: case c_sliceto: case c_booltest: case c_not_booltest: case c_set: case c_unset: case c_true: case c_false: case c_debug: case c_functionend: // Doesn't change the cursor or always restores it. break; case c_grouping: case c_literalstring: case c_name: case c_non: case c_hop: case c_next: case c_substring: case c_tomark: // Doesn't modify the cursor on failure. break; case c_delete: case c_repeat: case c_slicefrom: case c_tolimit: case c_attach: case c_insert: // Can't fail, so can't modify the cursor on failure. break; case c_goto: case c_try: // Restores the cursor on failure. break; case c_gopast: // Restores the cursor on failure if repeat_restore() is true. if (!repeat_restore(p->left)) return true; break; case c_call: /* Recursive functions aren't typical in snowball programs, so * make the pessimistic assumption that keep is needed if we * hit a generous limit on recursion. It's not likely to make * a difference to any real world program, but means we won't * recurse until we run out of stack for pathological cases. */ if (call_depth >= 100) return true; if (K_needed_(p->name->definition->left, call_depth + 1)) return true; break; case c_bra: case c_loop: case c_fail: if (K_needed_(p->left, call_depth)) return true; break; case c_backwards: case c_reverse: case c_test: if (p->possible_signals != 1) return true; // Restores cursor on t and the subcommand can't fail. 
/* Public entry point for K_needed_node_on_f_(): asks about the single node
 * p, starting with a call depth of 0.
 */
extern int K_needed_node_on_f(struct node * p) {
    return K_needed_node_on_f_(p, 0);
}

// Like K_needed(), but for the sub-node chain of c_or.  We only restore on
// signal f, and the cursor only needs to be restored between nodes so we don't
// need to check the final node in the chain.
//
// NOTE(review): p is dereferenced unconditionally, so callers must pass a
// non-NULL chain.
extern int K_needed_for_or(struct node * p) {
    while (p->right) {
        if (K_needed_node_on_f(p)) return true;
        p = p->right;
    }
    return false;
}

// Like K_needed(), but for the sub-node chain of c_and.  The cursor only needs
// to be restored between nodes so we don't need to check the final node in the
// chain.
//
// NOTE(review): p is dereferenced unconditionally, so callers must pass a
// non-NULL chain.
extern int K_needed_for_and(struct node * p) {
    while (p->right) {
        if (K_needed_node(p, 0)) return true;
        p = p->right;
    }
    return false;
}

/* Score the command chain starting at p for repeat_restore().
 *
 * The score starts at 0.  Commands in the first case group contribute
 * nothing; each command in the c_name ... c_hop group adds one; calls and
 * brackets add the score of the chain they lead to.  Scanning stops as soon
 * as the score reaches 2, and any command type not listed returns 2
 * immediately, so the result is only meaningful as "less than 2" versus
 * "2 or more" (which is exactly how repeat_restore() uses it).
 *
 * call_depth is the number of routine calls followed so far.
 */
static int repeat_score(struct node * p, int call_depth) {
    int score = 0;
    while (p) {
        switch (p->type) {
            case c_dollar:
            case c_leftslice:
            case c_rightslice:
            case c_assign:
            case c_plusassign:
            case c_minusassign:
            case c_multiplyassign:
            case c_divideassign:
            case c_eq:
            case c_ne:
            case c_gt:
            case c_ge:
            case c_lt:
            case c_le:
            case c_sliceto: /* case c_not: must not be included here! */
            case c_booltest:
            case c_not_booltest:
            case c_set:
            case c_unset:
            case c_true:
            case c_false:
            case c_debug:
            case c_functionend:
                /* Contributes nothing to the score. */
                break;

            case c_call:
                /* Recursive functions aren't typical in snowball programs, so
                 * make the pessimistic assumption that repeat requires cursor
                 * reinstatement if we hit a generous limit on recursion.  It's
                 * not likely to make a difference to any real world program,
                 * but means we won't recurse until we run out of stack for
                 * pathological cases.
                 */
                if (call_depth >= 100) {
                    return 2;
                }
                /* Add the score of the called routine's body. */
                score += repeat_score(p->name->definition->left, call_depth + 1);
                if (score >= 2) return score;
                break;

            case c_bra:
                /* Add the score of the bracketed sub-chain. */
                score += repeat_score(p->left, call_depth);
                if (score >= 2) return score;
                break;

            case c_name:
            case c_literalstring:
            case c_next:
            case c_grouping:
            case c_non:
#if 0
            // These could be here if the target-language helpers all preserved
            // the cursor on failure:
            case c_goto_grouping:
            case c_gopast_grouping:
            case c_goto_non:
            case c_gopast_non:
#endif
            case c_hop:
                /* Each of these counts one; a second one reaches the
                 * threshold. */
                if (++score >= 2) return score;
                break;

            default:
                /* Unknown command: assume reinstatement is required. */
                return 2;
        }
        p = p->right;
    }
    return score;
}

/* tests if an expression requires cursor reinstatement in a repeat */
extern int repeat_restore(struct node * p) {
    return repeat_score(p, 0) >= 2;
}

/* Language-independent write routines for simple entities */

/* Append the upper-case hex digit for the low 4 bits of i to the output
 * buffer. */
static void write_hexdigit(struct generator * g, unsigned i) {
    str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */
}

/* Append the low 16 bits of ch as exactly four hex digits, most
 * significant digit first. */
extern void write_hex4(struct generator * g, unsigned ch) {
    for (int i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i);
}

/* Append i in hex with no leading zeros (a lone "0" for zero): recurses to
 * emit the higher digits first, then the lowest digit. */
extern void write_hex(struct generator * g, unsigned i) {
    if (i >> 4) write_hex(g, i >> 4);
    write_hexdigit(g, i); /* hex integer */
}

/* Append the single character ch to the output buffer. */
extern void write_char(struct generator * g, int ch) {
    str_append_ch(g->outbuf, ch); /* character */
}

/* End the current output line: strip any trailing spaces and tabs already
 * in the buffer, append '\n' and bump the generator's line counter.
 *
 * NOTE(review): the loop relies on str_back() returning something other
 * than ' ' or '\t' once the buffer is empty - confirm in the str
 * implementation.
 */
extern void write_newline(struct generator * g) {
    /* Avoid generating trailing whitespace. */
    while (true) {
        int ch = str_back(g->outbuf);
        if (ch != ' ' && ch != '\t') break;
        str_pop(g->outbuf);
    }
    str_append_ch(g->outbuf, '\n'); /* newline */
    g->line_count++;
}

/* Append the NUL-terminated string s to the output buffer. */
extern void write_string(struct generator * g, const char * s) {
    str_append_string(g->outbuf, s);
}

/* Append the character ch to the output buffer via
 * str_append_wchar_as_utf8(). */
extern void write_wchar_as_utf8(struct generator * g, symbol ch) {
    str_append_wchar_as_utf8(g->outbuf, ch);
}

/* Append the integer i to the output buffer via str_append_int(). */
extern void write_int(struct generator * g, int i) {
    str_append_int(g->outbuf, i);
}
extern void wi3(struct generator * g, int i) { if (i < 100) write_char(g, ' '); if (i < 10) write_char(g, ' '); write_int(g, i); } extern void write_s(struct generator * g, const byte * s) { str_append_s(g->outbuf, s); } extern void write_str(struct generator * g, struct str * str) { str_append(g->outbuf, str); } snowball-3.1.0/compiler/generator_ada.c000066400000000000000000001614561520373054300201240ustar00rootroot00000000000000#include #include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h" /* prototype functions for recursive use: */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); /* Write routines for items from the syntax tree */ static void write_relop(struct generator * g, int relop) { // Relational operators are the same as C, save for (in-)equality. switch (relop) { case c_eq: write_string(g, " = "); break; case c_ne: write_string(g, " /= "); break; default: write_c_relop(g, relop); } } static void write_varname(struct generator * g, struct name * p) { int ends_in_underscore = (p->s[SIZE(p->s) - 1] == '_'); if (p->type == t_external) { if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } } else { /* Ada identifiers are case-insensitive but Snowball identifiers * should be case-sensitive. To address this, if any groups of * identifiers of the same type have the same case, we insert a counter * after the type-code for all but one of them. This count is unique * within each type of variable so this avoid collisions while being * minimally intrusive on the readability of the generated code. * * Ada also doesn't allow identifiers to end in an underscore, * so we append a `E` to such identifiers. 
To avoid potential * collisions from doing so, we also insert the counter after * the type-code in this case (which may not be necessary, but this * case seems more of a theoretical concern, whereas identifiers * differing only by case has been seen in real-world Snowball code). * * So for example (noting that we upper-case the first character of * the Snowball name below): * * Snowball integer `i` -> Ada `I_I` * Snowball integer `I` -> Ada `I2_I` * Snowball integer `i_` -> Ada `I3_I_E` * Snowball integer `i_e` -> Ada `I_I_e` * * We don't try to solve this problem for external identifiers - it * seems more helpful to leave those alone and encourage snowball * program authors to avoid naming externals which only differ by * case. * * We use the same naming scheme for both global and local variables. */ write_char(g, "SBIRXG"[p->type]); if (ends_in_underscore || p->case_collision) { write_int(g, p->count); } write_char(g, '_'); } { char save_initial = p->s[0]; p->s[0] = toupper(save_initial); str_append_s(g->outbuf, p->s); p->s[0] = save_initial; } if (ends_in_underscore) write_char(g, 'E'); } /* Reference to variable, e.g. when assigning to or using in an expression. */ static void write_varref(struct generator * g, struct name * p) { if (p->type < t_routine && p->local_to == NULL) { write_string(g, "Z."); } write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { if (SIZE(p) == 0) { write_string(g, "\"\""); return; } // Ada supports UTF-8 literal strings, we only need to escape the quote and // special characters. bool in_quotes = false; int i = 0; while (i < SIZE(p)) { int ch; int w = get_utf8(p + i, &ch); // Write out ASCII and lower Unicode printables as literal characters. // Use escapes for anything over 0x590 as a crude way to avoid LTR // characters affecting the rendering of source character order in // confusing ways. 
if ((32 <= ch && ch < 127) || (0xa0 < ch && ch < 0x590)) { if (!in_quotes) { if (i > 0) { write_string(g, " & "); } write_char(g, '"'); in_quotes = true; } if (ch == '"') write_char(g, '\\'); write_wchar_as_utf8(g, ch); } else { if (in_quotes) { write_char(g, '"'); in_quotes = false; } for (int j = i; j < i + w; ++j) { if (j > 0) { write_string(g, " & "); } write_string(g, "Character'Val("); write_int(g, (int)p[j]); write_string(g, ")"); } } i += w; } if (in_quotes) { write_char(g, '"'); } } /* Write a variable declaration. */ static void write_declare(struct generator * g, const char * declaration, struct node * p) { struct str * temp = g->outbuf; g->outbuf = g->declarations; write_string(g, " "); writef(g, declaration, p); write_char(g, ';'); write_newline(g); g->outbuf = temp; } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "-- "); write_comment_content(g, p, NULL); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~Mbegin~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~Mend;~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "Z.L - "; write_declare(g, "~B0 : Char_Index", p); writef(g, "~M~B0 := ~S1Z.C;~N" , p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "Z.C := "); if (p->mode != m_forward) str_append_string(out, "Z.L - "); str_append(out, savevar); str_append_ch(out, ';'); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void wsetl(struct generator * g, int n) { write_newline(g); write_margin(g); write_string(g, "<>"); write_newline(g); } static void write_failure(struct generator * g) { 
if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "Result := False;"); write_newline(g); write_margin(g); write_string(g, "return;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_char(g, ';'); g->label_used = 1; } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, " then~N~+", p); write_failure(g); writef(g, "~-~Mend if;~N", p); g->unreachable = false; } /* Formatted write. */ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_failure(g); g->unreachable = false; continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c 
in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static int need_among_var(struct node *p) { while (p) { if (p->type == c_among) { return 1; } if (p->left && need_among_var(p->left)) { return 1; } if (p->aux && need_among_var(p->aux)) { return 1; } p = p->right; } return 0; } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: // Avoid `parentheses required for unary minus` error from gnat. if (p->number < 0) write_char(g, '('); write_int(g, p->number); if (p->number < 0) write_char(g, ')'); break; case c_maxint: write_string(g, "Integer'Last"); break; case c_minint: write_string(g, "Integer'First"); break; case c_neg: write_string(g, "(-"); generate_AE(g, p->right); write_char(g, ')'); break; case c_multiply: s = " * "; goto label0; case c_divide: s = " / "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "Z.C"); break; case c_limit: w(g, p->mode == m_forward ? 
"Z.L" : "Z.Lb"); break; case c_len: w(g, "Length_Utf8 (Z.P, Z.Len)"); break; case c_size: w(g, "Z.Len"); break; case c_lenof: writef(g, "Length_Utf8 (~V, Z.L~W)", p); break; case c_sizeof: writef(g, "Z.L~W", p); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); bool end_unreachable = true; write_comment(g, p); w(g, "~Mloop~N~+"); if (savevar && K_needed_node_on_f(p)) { write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; g->label_used = 0; generate(g, p); if (!g->unreachable) { w(g, "~Mexit;~N"); end_unreachable = false; } if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); if (!g->unreachable) { w(g, "~Mexit;~N"); } w(g, "~-~Mend loop;~N"); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MZ.Lb := Z.C; Z.C := Z.L;~N", p); generate(g, p->left); w(g, "~MZ.C := Z.Lb;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); int l = g->failure_label; generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); if (g->label_used) wsetl(g, l); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if 
(savevar) { str_delete(savevar); } } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V := True;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V := False;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); writef(g, "~M~V (Z, Result);~N", p->left); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) w(g, "~MC := Skip_Utf8 (Z);~N"); else w(g, "~MC := Skip_Utf8_Backward (Z);~N"); write_failure_if(g, "C < 0", p); w(g, "~MZ.C := C;~N"); g->temporary_used = true; } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->S[1] = complement ? 
"In" : "Out"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { writef(g, "~M~S1_Grouping~S0 (Z, ~V, ~I0, ~I1, True, C);~N", p); write_failure_if(g, "C < 0", p); } else { writef(g, "~M~S1_Grouping~S0 (Z, ~V, ~I0, ~I1, True, C);~N", p); write_failure_if(g, "C < 0", p); if (p->mode == m_forward) w(g, "~MZ.C := Z.C + C;~N"); else w(g, "~MZ.C := Z.C - C;~N"); } g->temporary_used = true; } static void generate_GO(struct generator * g, struct node * p, int is_goto) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; bool end_unreachable = false; w(g, "~Mloop~N~+"); struct str * savevar = NULL; if (is_goto || repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (is_goto) write_restorecursor(g, p, savevar); w(g, "~Mexit;~N"); } g->unreachable = false; if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->label_used = used; g->failure_label = a0; generate_next(g, p); w(g, "~-~Mend loop;~N"); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mfor ~B0 in reverse 1 .. "); generate_AE(g, p->AE); writef(g, " loop~N~+", p); generate(g, p->left); w(g, "~-~Mend loop;~N"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { int replab = new_label(g); wsetl(g, replab); // We only actually loop via `goto` to replab, but if we don't generate // a dummy `loop` ... 
`exit; end loop;` structure then gnat emits: // // info: code between label and backwards goto rewritten as loop [enabled by default] writef(g, "~N~Mloop~N~+", p); struct str * savevar = NULL; if (repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0 := ~B0 - 1;~N"); } g->I[0] = replab; w(g, "~Mgoto lab~I0;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~N~Mexit;~N~-~Mend loop;~N"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~{"); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); write_declare(g, "~B0 : Integer", p); w(g, "~M~B0 := "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif Z.C ~S0 "); generate_AE(g, p->AE); w(g, " then~N"); write_failure(g); w(g, "~Mend if;~N"); g->unreachable = false; w(g, "~MZ.C := "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_Backward"; w(g, "~MC := Skip_Utf8~S0 (Z, "); generate_AE(g, p->AE); writef(g, ");~N", p); write_failure_if(g, "C < 0", p); writef(g, "~MZ.C := C;~N", p); g->temporary_used = true; } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MSlice_Del (Z);~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "b"; writef(g, "~MZ.C := Z.L~S0;~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "Bra" : "Ket"; writef(g, "~MZ.~S0 := Z.C;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "Ket" : "Bra"; writef(g, "~MZ.~S0 := Z.C;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V := Assign_To (Z, ~V);~N", p); write_failure_if(g, "~V == 0", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MZ.L~W := Z.Ket - Z.Bra;~N", p); writef(g, "~M~V (1 .. Z.L~W) := Z.P (Z.Bra + 1 .. 
Z.Ket);~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); write_string(g, ", "); write_int(g, SIZE(b)); } else { write_varref(g, p->name); write_string(g, ", Z.L"); write_varname(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~MC := Z.C;~N"); writef(g, "~MInsert (Z, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~MZ.C := C;~N"); g->temporary_used = true; } } static void generate_stringassign(struct generator * g, struct node * p) { write_comment(g, p); int keep_c = p->mode == m_forward; /* like 'attach' */ if (keep_c) { writef(g, "~MC := Z.C;~N", p); g->temporary_used = true; } if (p->mode == m_forward) { /* like 'attach' */ writef(g, "~MReplace (Z, Z.C, Z.L, ", p); } else { writef(g, "~MReplace (Z, Z.Lb, Z.C, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~MZ.C := C;~N"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~MSlice_From (Z, "); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); g->B[0] = str_data(varname); write_declare(g, "~B0 : Integer", p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif Z.C ~S0 "); generate_AE(g, q->AE); writef(g, " then~N~+", q); write_failure(g); w(g, "~-~Mend if;~N"); g->unreachable = false; if (p->mode == m_forward) { w(g, "~M~B0 := Z.L; Z.L := "); generate_AE(g, q->AE); w(g, "; ~B0 := ~B0 - Z.L;~N"); } else { w(g, "~M~B0 := Z.Lb; Z.Lb := "); generate_AE(g, q->AE); w(g, ";~N"); } if (p->mode == m_forward) { str_assign(g->failure_str, "Z.L := Z.L + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "Z.Lb := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~M~B0 := Z.L - Z.C;~N"); w(g, "~MZ.L := Z.C;~N"); } else { w(g, "~M~B0 := Z.Lb;~N"); w(g, "~MZ.Lb := Z.C;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "Z.L := Z.L + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "Z.Lb := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); write_declare(g, "~B0_C : Char_Index", p); write_declare(g, "~B0_Len : Char_Index", p); write_declare(g, "~B0_L : Char_Index", p); 
write_declare(g, "~B0_Lb : Char_Index", p); write_declare(g, "~B0_Bra : Char_Index", p); write_declare(g, "~B0_Ket : Char_Index", p); write_declare(g, "~B0_P : String (1 .. WORD_MAX_LENGTH)", p); writef(g, "~{" "~M~B0_C := Z.C;~N" "~M~B0_Len := Z.Len;~N" "~M~B0_L := Z.L;~N" "~M~B0_Lb := Z.Lb;~N" "~M~B0_Bra := Z.Bra;~N" "~M~B0_Ket := Z.Ket;~N" "~M~B0_P := Z.P;~N" "~MZ.P := ~V;~N" "~MZ.Len := Z.L~W;~N" "~MZ.C := 0;~N" "~MZ.Bra := 0;~N" "~MZ.Ket := 0;~N" "~MZ.L := Z.Len;~N", p); if (p->left->possible_signals == -1) { /* Assume failure. */ write_declare(g, "~B0_F : Boolean", p); w(g, "~M~B0_F := True;~N"); } generate(g, p->left); if (!g->unreachable && p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_F := False;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~M~V := Z.P;~N" "~MZ.L~W := Z.Len;~N" "~MZ.C := ~B0_C;~N" "~MZ.Len := ~B0_Len;~N" "~MZ.L := ~B0_L;~N" "~MZ.Lb := ~B0_Lb;~N" "~MZ.Bra := ~B0_Bra;~N" "~MZ.Ket := ~B0_Ket;~N" "~MZ.P := ~B0_P;~N", p); if (p->left->possible_signals == 0) { // p->left always signals f. w(g, "~M~f~N"); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_F", p); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); writef(g, "~M~V := ", p); if (s != NULL) { g->S[0] = s; writef(g, "~V ~S0 ", p); } generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~MResult := ("); p->right = NULL; } else { w(g, "~Mif "); // We want the inverse of the snowball test here. 
relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ");~N"); } else { w(g, " then~+~N"); write_failure(g); w(g, "~-~Mend if;~N"); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~M~V (Z, Result);~N", p); w(g, "~Mreturn;~N"); p->right = NULL; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~M~V (Z, Result);~N", p); w(g, "~Mreturn~N"); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V (Z, Result);~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V (Z, Result);~N", p); write_failure(g); } else { writef(g, "~M~V (Z, Result);~N", p); write_failure_if(g, "not Result", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->S[1] = complement ? "Out_" : "In_"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~M~S1Grouping~S0 (Z, ~V, ~I0, ~I1, False, C);~N", p); w(g, "~MResult := (C = 0);~N"); p->right = NULL; } else { writef(g, "~M~S1Grouping~S0 (Z, ~V, ~I0, ~I1, False, C);~N", p); write_failure_if(g, "C /= 0", p); } g->temporary_used = true; } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_Backward"; if (tailcallable(g, p)) { writef(g, "~MEq_S~S0 (Z, ~V, Z.L~W, Result);~N", p); p->right = NULL; } else { writef(g, "~MEq_S~S0 (Z, ~V, Z.L~W, Result);~N", p); write_failure_if(g, "not Result", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_Backward"; g->I[0] = SIZE(p->literalstring); if (tailcallable(g, p)) { writef(g, "~MEq_S~S0 (Z, ~L, ~I0, Result);~N", p); p->right = NULL; } else { writef(g, "~MEq_S~S0 (Z, ~L, ~I0, Result);~N", p); write_failure_if(g, "not Result", p); } } static void generate_define(struct generator * g, struct node * p) { write_newline(g); write_comment(g, p); /* Generate function header. */ writef(g, "~Mprocedure ~W (Z : in out Context_Type; Result : out Boolean) is~N", p); /* Save output. */ struct str * saved_output = g->outbuf; struct str * saved_declarations = g->declarations; g->outbuf = str_new(); g->declarations = str_new(); g->next_label = 0; g->var_number = 0; g->failure_label = x_return; g->unreachable = false; /* Generate function body. */ w(g, "~{"); int signals = p->left->possible_signals; g->temporary_used = false; generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (signals) { generate(g, p->left->right); } } #define FINAL_RETURN "\n return;\n" #define FINAL_RETURN_LEN (sizeof(FINAL_RETURN) - 1) if (memcmp(str_data(g->outbuf) + str_len(g->outbuf) - FINAL_RETURN_LEN, FINAL_RETURN, FINAL_RETURN_LEN) == 0) { // If generate_functionend() has just added a return we remove it again. // This is really only a cosmetic improvement. str_pop_n(g->outbuf, FINAL_RETURN_LEN - 1); } writef(g, "~-~Mend ~W;~N", p); if (need_among_var(p->left)) { str_append_string(saved_output, " A : Integer;\n"); } if (g->temporary_used) { str_append_string(saved_output, " C : Result_Index;\n"); } /* Declare localised variables. 
*/ struct str * temp = g->outbuf; g->outbuf = saved_output; for (struct name * name = g->analyser->names; name; name = name->next) { if (name->local_to == p->name) { switch (name->type) { case t_string: assert(0); break; case t_integer: w(g, " "); write_varname(g, name); w(g, " : Integer;\n"); break; case t_boolean: w(g, " "); write_varname(g, name); w(g, " : Boolean;\n"); break; } } } g->outbuf = temp; if (g->var_number) { str_append(saved_output, g->declarations); } str_append(saved_output, g->outbuf); str_delete(g->declarations); str_delete(g->outbuf); g->declarations = saved_declarations; g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~MResult := True;~N~Mreturn;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = x->shortest_size; int block_opened = 0; g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->I[0] = x->number; /* In forward mode with non-ASCII UTF-8 characters, the first byte * of the string will often be the same, so instead look at the last * common byte position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. 
*/ for (int c = 0; c < x->literalstring_count; ++c) { symbol ch; if (among_cases[c].size == 0) { empty_case = c; continue; } if (p->mode == m_forward) { ch = among_cases[c].b[shortest_size - 1]; } else { ch = among_cases[c].b[among_cases[c].size - 1]; } if (n_cases == 0) { block = ch >> 5; } else if (ch >> 5 != block) { block = -1; if (n_cases > 2) break; } if (block == -1) { if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { ++n_cases; break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { bitmap |= 1u << (ch & 0x1f); if (n_cases < 2) cases[n_cases] = ch; ++n_cases; } } } int pre_check = (block != -1 || n_cases <= 2); if (pre_check) { char buf[64]; char buf2[128]; char buf3[64]; g->I[2] = block; g->I[3] = bitmap; g->I[4] = shortest_size - 1; g->S[3] = buf3; checked_snprintf(buf3, sizeof(buf3), "16#%x#", bitmap); if (p->mode == m_forward) { if (shortest_size == 1) { checked_snprintf(buf, sizeof(buf), "Z.C"); } else { checked_snprintf(buf, sizeof(buf), "Z.C + %d", shortest_size - 1); } checked_snprintf(buf2, sizeof(buf2), "Character'Pos (Z.P (%s + 1))", buf); g->S[1] = buf; g->S[2] = buf2; if (shortest_size == 1) { writef(g, "~Mif Z.C >= Z.L", p); } else { writef(g, "~Mif Z.C + ~I4 >= Z.L", p); } } else { g->S[1] = "Z.C - 1"; g->S[2] = "Character'Pos (Z.P (Z.C))"; if (shortest_size == 1) { writef(g, "~Mif Z.C <= Z.Lb", p); } else { writef(g, "~Mif Z.C - ~I4 <= Z.Lb", p); } } assert(n_cases > 0); if (n_cases == 1) { g->I[4] = cases[0]; writef(g, " or else ~S2 /= ~I4", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; writef(g, " or else (~S2 /= ~I4 and then ~S2 /= ~I5)", p); } else { writef(g, " or else Check_Among (Z, ~S1, ~I2, ~S3)", p); } writef(g, " then~+~N", p); if (empty_case != -1 && !among_cases[empty_case].function) { /* If the among includes the empty string, it can never fail * so not matching the bitmap means we match the empty string. 
*/ g->I[4] = among_cases[empty_case].result; writef(g, "~MA := ~I4;~-~N~Melse~+~N", p); block_opened = 1; } else { writef(g, "~f", p); writef(g, "~-~Mend if;~N", p); } } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } g->S[1] = (x->function_count > 0) ? "Among_Handler'Access" : "null"; if (x->amongvar_needed) { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, ~S1, A);~N", p); if (!x->always_matches) { write_failure_if(g, "A = 0", p); } if (block_opened) writef(g, "~-~Mend if;~N", p); return; } if (pre_check && !x->function_count) { // If all cases are one symbol long (so one byte of UTF-8, one // character long in fixed-width encodings) then we don't need to call // the helper and can just inc/dec the cursor by 1. if (x->longest_size == 1 && !x->always_matches) { if (p->mode == m_forward) { w(g, "~MZ.C := Z.C + 1;~N"); } else { w(g, "~MZ.C := Z.C - 1;~N"); } // Suppress generating table for this among. x->used = false; return; } } writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, ~S1, A);~N", p); if (x->always_matches) { // The result in `A` can't be zero. } else if (x->command_count == 0 && tailcallable(g, p)) { writef(g, "~MResult := A /= 0;~N", p); x->node->right = NULL; } else { write_failure_if(g, "A = 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mcase A is~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mwhen ~I0 =>~N"); g->margin++; generate(g, x->commands[i - 1]); g->margin--; g->unreachable = false; } w(g, "~Mwhen others =>~N"); w(g, "~M null;~N"); w(g, "~-~Mend case;~N"); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. if (inverted) { writef(g, "~MResult := not ~V;~N", p); } else { writef(g, "~MResult := ~V;~N", p); } p->right = NULL; return; } if (inverted) { write_failure_if(g, "~V", p); } else { write_failure_if(g, "not ~V", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(Z, ~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: 
generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, NULL); break; case c_plusassign: generate_integer_assign(g, p, "+"); break; case c_minusassign: generate_integer_assign(g, p, "-"); break; case c_multiplyassign:generate_integer_assign(g, p, "*"); break; case c_divideassign: generate_integer_assign(g, p, "/"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; 
str_delete(g->failure_str); g->failure_str = a1; } /* Class declaration generation. */ static void generate_unit_start(struct generator * g) { write_start_comment(g, "-- ", NULL); } static void generate_method_decl(struct generator * g, struct name * q) { w(g, "~Mprocedure "); write_varref(g, q); w(g, " (Z : in out Context_Type; Result : out Boolean);~N"); } static void generate_method_decls(struct generator * g, enum name_types type) { for (struct name * q = g->analyser->names; q; q = q->next) { if ((enum name_types)q->type == type) { generate_method_decl(g, q); } } } static void generate_member_decls(struct generator * g) { w(g, " type Context_Type is new Stemmer.Context_Type with"); if (g->analyser->variable_count > 0) { w(g, " record~N~+"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: write_margin(g); write_varname(g, q); w(g, " : String (1 .. WORD_MAX_LENGTH);~N"); write_margin(g); write_char(g, 'L'); write_varname(g, q); w(g, " : Char_Index := 0;~N"); break; case t_integer: write_margin(g); write_varname(g, q); w(g, " : Integer;~N"); break; case t_boolean: write_margin(g); write_varname(g, q); w(g, " : Boolean;~N"); break; } } w(g, "~-~Mend record;~N"); } else { w(g, " null record;~N"); } } static int generate_among_string(struct generator * g, struct among * x, int count) { struct amongvec * v = x->b; int limit = count == 0 ? 38 : 80; g->I[0] = x->number; for (int i = 0; i < x->literalstring_count; i++, v++) { /* Write among's string. 
*/ g->I[1] = i; if (count + SIZE(v->b) > limit) { w(g, "~N~M& "); count = 3; limit = 80; } else if (count > 0) { w(g, " & "); } write_literal_string(g, v->b); count += SIZE(v->b) + 5; } return count; } static int generate_among_table(struct generator * g, struct among * x, int start_pos, int *operation) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count - 1; w(g, "~N~MA_~I0 : constant Among_Array_Type (0 .. ~I1) := ~+(~N"); v = x->b; for (int i = 0; i < x->literalstring_count; i++) { g->I[1] = start_pos; /* Write among's string position. */ if (x->literalstring_count == 1) { w(g, "~Mothers => (~I1, "); } else { w(g, "~M(~I1, "); } start_pos = start_pos + SIZE(v[i].b); g->I[1] = start_pos - 1; w(g, "~I1, "); /* Write among's index & result. */ g->I[2] = v[i].i; w(g, "~I2, "); g->I[2] = v[i].result; w(g, "~I2, "); /* Write among's handler. */ if (v[i].function == NULL) { w(g, "0)"); } else { *operation = *operation + 1; g->I[1] = *operation; w(g, "~I1)"); } if (i + 1 < x->literalstring_count) { w(g, ",~N"); } } w(g, ");~-~N"); return start_pos; } static void generate_amongs(struct generator * g) { if (!g->analyser->amongs) return; struct str * s = g->outbuf; g->outbuf = g->declarations; w(g, "~N~MAmong_String : constant String := ~+"); int count = 0; for (struct among * x = g->analyser->amongs; x != NULL; x = x->next) { if (x->used) { count = generate_among_string(g, x, count); } } w(g, ";~N~-"); int operation = 0; int start_pos = 1; for (struct among * x = g->analyser->amongs; x != NULL; x = x->next) { if (x->used) { start_pos = generate_among_table(g, x, start_pos, &operation); } } g->outbuf = s; if (operation == 0) return; operation = 0; w(g, "~N~Mprocedure Among_Handler (Context : in out Stemmer.Context_Type'Class; Operation : in Operation_Index; Result : out Boolean) is~N"); w(g, "~Mbegin~+~N~M"); w(g, "case Operation is~+~N~M"); for (struct among * x = g->analyser->amongs; x; x = x->next) { struct 
amongvec * v = x->b; for (int i = 0; i < x->literalstring_count; i++) { if (v[i].function != NULL) { operation++; g->I[2] = operation; w(g, "when ~I2 =>~N~M "); write_varname(g, v[i].function); w(g, " (Context_Type (Context), Result);~N~M"); } } } w(g, "when others =>~N~M"); w(g, " Result := False;~-~N~Mend case;~-~N~M"); w(g, "end Among_Handler;~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } static void set_bit(symbol * b, int i) { b[i >> 3] |= 1 << (i & 7); } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7) / 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->I[0] = 8 * size - 1; w(g, "~N~M"); write_varname(g, q->name); w(g, " : constant Grouping_Array (0 .. ~I0) := (~N~+~M"); for (int i = 0; i < size; i++) { unsigned char m = map[i]; if (i) w(g, ",~N~M"); for (int j = 0; j < 8; j++) { if (j) w(g, ", "); if (m & (1 << j)) { w(g, "True"); } else { w(g, "False"); } } } w(g, "~N~-~M);~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct str * s = g->outbuf; g->outbuf = g->declarations; for (struct grouping * q = g->analyser->groupings; q; q = q->next) { generate_grouping_table(g, q); } g->outbuf = s; } extern void generate_program_ada(struct generator * g) { g->margin_indent = " "; g->outbuf = str_new(); g->failure_str = str_new(); generate_unit_start(g); /* generate implementation. */ if (g->analyser->debug_used) { // System.Io is GNAT-specific, but apparently we can't use Text_IO in a // preelaborated package. 
w(g, "with System.Io;~N"); } w(g, "package body Stemmer."); write_string(g, g->options->package); w(g, " is~N~+~N"); w(g, "~Mpragma Style_Checks (\"-mr\");~N"); w(g, "~Mpragma Warnings (Off, \"*variable*is never read and never assigned*\");~N"); w(g, "~Mpragma Warnings (Off, \"*mode could be*instead of*\");~N"); w(g, "~Mpragma Warnings (Off, \"*formal parameter.*is not modified*\");~N"); w(g, "~Mpragma Warnings (Off, \"*this line is too long*\");~N"); w(g, "~Mpragma Warnings (Off, \"*is not referenced*\");~N"); w(g, "~N"); if (g->analyser->debug_used) { w(g, "~N" "~Mprocedure Debug (Z : in Context_Type'Class; N : in Integer; Line : in Integer) is~N~+" "~MLen : Integer;~N" "~MCh : Integer;~N" "~-~Mbegin~N~+" "~Mif N < 10 then~N~+" "~MSystem.Io.Put (\" \");~N" "~-~Mend if;~N" "~Mif N < 100 then~N~+" "~MSystem.Io.Put (\" \");~N" "~-~Mend if;~N" "~MSystem.Io.Put (N);~N" "~MSystem.Io.Put (\" (line \");~N" "~Mif Line < 10 then~N~+" "~MSystem.Io.Put (\" \");~N" "~-~Mend if;~N" "~Mif Line < 100 then~N~+" "~MSystem.Io.Put (\" \");~N" "~-~Mend if;~N" "~Mif Line < 1000 then~N~+" "~MSystem.Io.Put (\" \");~N" "~-~Mend if;~N" "~MSystem.Io.Put (Line);~N" "~MSystem.Io.Put (\"): [\");~N" "~MSystem.Io.Put (Z.Len);~N" "~MSystem.Io.Put (\"]'\");~N" "~MLen := Z.Len;~N" "~Mfor I in 0 .. Len + 1 loop~N~+" "~Mif Z.Lb = I then~N~+" "~MSystem.Io.Put (\"{\");~N" "~-~Mend if;~N" "~Mif Z.Bra = I then~N~+" "~MSystem.Io.Put (\"[\");~N" "~-~Mend if;~N" "~Mif Z.C = I then~N~+" "~MSystem.Io.Put (\"|\");~N" "~-~Mend if;~N" "~Mif Z.Ket = I then~N~+" "~MSystem.Io.Put (\"]\");~N" "~-~Mend if;~N" "~Mif Z.L = I then~N~+" "~MSystem.Io.Put (\"}\");~N" "~-~Mend if;~N" "~Mif I < Len then~N~+" "~MCh := Character'Pos (Z.P (I + 1));~N" "~Mif Ch = 0 then~N~+" "~MSystem.Io.Put (\"#\");~N" "~-~Melse~N~+" "~MSystem.Io.Put (Z.P (I + 1 .. 
I + 1));~N" "~-~Mend if;~N" "~-~Mend if;~N" "~-~Mend loop;~N" "~MSystem.Io.Put_Line (\"'\");~N" "~-~Mend Debug;~N"); } generate_method_decls(g, t_routine); bool need_among_handler = false; for (struct among * a = g->analyser->amongs; a; a = a->next) { if (a->function_count > 0) { need_among_handler = true; break; } } if (need_among_handler) { w(g, "~N~Mprocedure Among_Handler (Context : in out Stemmer.Context_Type'Class; Operation : in Operation_Index; Result : out Boolean);~N"); } g->declarations = g->outbuf; g->outbuf = str_new(); generate_methods(g); generate_amongs(g); generate_groupings(g); w(g, "~-end Stemmer."); write_string(g, g->options->package); w(g, ";~N"); output_str(g->options->output_src, g->declarations); str_delete(g->declarations); output_str(g->options->output_src, g->outbuf); str_clear(g->outbuf); write_start_comment(g, "-- ", NULL); w(g, "package Stemmer."); write_string(g, g->options->package); w(g, " with SPARK_Mode is~N~+"); w(g, " type Context_Type is new Stemmer.Context_Type with private;~N"); generate_method_decls(g, t_external); w(g, "private~N"); generate_member_decls(g); w(g, "end Stemmer."); write_string(g, g->options->package); w(g, ";~N"); output_str(g->options->output_h, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.1.0/compiler/generator_c.c000066400000000000000000002065471520373054300176220ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h" /* Define this to get warning messages when optimisations can't be used. 
*/ /* #define OPTIMISATION_WARNINGS */ /* prototype functions for recursive use: */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); /* Write routines for items from the syntax tree */ static void write_relop(struct generator * g, int relop) { write_c_relop(g, relop); } static void write_varname(struct generator * g, struct name * p) { if (p->type == t_external) { if (g->options->target_lang == LANG_CPLUSPLUS) { write_string(g, g->options->package); write_string(g, "::"); write_s(g, g->options->name); write_string(g, "::"); } if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } } else { /* Name variables using their Snowball name prefixed by s_, b_ or i_ * depending on their type. * * We use the same naming scheme for both global and local variables. */ write_char(g, "sbirxg"[p->type]); write_char(g, '_'); } write_s(g, p->s); } /* Reference to variable, e.g. when assigning to or using in an expression. 
*/ static void write_varref(struct generator * g, struct name * p) { if (p->type < t_routine && p->local_to == NULL) { write_string(g, "((SN_local *)z)->"); } write_varname(g, p); } /* write character literal */ static void wlitch(struct generator * g, int ch) { if (32 <= ch && ch < 127) { write_char(g, '\''); if (ch == '\'' || ch == '\\') { write_char(g, '\\'); } write_char(g, ch); write_char(g, '\''); } else { write_string(g, "0x"); write_hex(g, ch); } } static void wlitarray(struct generator * g, symbol * p) { /* write literal array */ write_string(g, "{ "); for (int i = 0; i < SIZE(p); i++) { if (i) write_string(g, ", "); wlitch(g, p[i]); } write_string(g, " }"); } static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */ if (SIZE(p) == 0) { write_char(g, '0'); } else { struct str * s = g->outbuf; g->outbuf = g->declarations; write_string(g, "static const symbol s_"); write_int(g, g->literalstring_count); write_string(g, "[] = "); wlitarray(g, p); write_string(g, ";\n"); g->outbuf = s; write_string(g, "s_"); write_int(g, g->literalstring_count); g->literalstring_count++; } } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "/* "); write_comment_content(g, p, "*/"); write_string(g, " */"); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { if (g->line_labelled == g->line_count) { // Before C23, `;` is required between a label and the block end. 
w(g, "~M;~N"); } w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "z->l - "; writef(g, "~Mint ~B0 = ~S1z->c;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "z->c = "); if (p->mode != m_forward) str_append_string(out, "z->l - "); str_append(out, savevar); str_append_ch(out, ';'); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void winc(struct generator * g, struct node * p) { /* increment c */ write_string(g, p->mode == m_forward ? "z->c++;" : "z->c--;"); } static void wsetl(struct generator * g, int n) { g->margin--; write_margin(g); write_string(g, "lab"); write_int(g, n); write_char(g, ':'); write_newline(g); g->line_labelled = g->line_count; g->margin++; } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_string(g, "{ "); write_str(g, g->failure_str); write_char(g, ' '); } switch (g->failure_label) { case x_return: write_string(g, "return 0;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_char(g, ';'); g->label_used = 1; } if (str_len(g->failure_str) != 0) write_string(g, " }"); } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_string(g, "if (z->c >= z->l) "); } else { write_string(g, "if (z->c <= z->lb) "); } write_failure(g); } static void write_data_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_int(g, SIZE(b)); w(g, ", "); wlitref(g, b); } else { write_varref(g, p->name); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_failure(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'F': { // Among function dispatcher. struct among * x = p->among; if (x->function_count == 0) { write_char(g, '0'); continue; } if (x->function_count == 1) { // Only one different function used in this among. 
struct amongvec * v = x->b; for (int j = 0; j < x->literalstring_count; j++) { if (v[j].function) { write_varref(g, v[j].function); goto continue_outer_loop; } } fprintf(stderr, "function_count == 1 but no among functions\n"); exit(1); continue_outer_loop: continue; } w(g, "af_"); write_int(g, x->number); continue; } case 'I': case 'J': case 'c': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; if (ch == 'I') write_int(g, g->I[j]); else if (ch == 'J') wi3(g, g->I[j]); else wlitch(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': wlitref(g, p->literalstring); continue; case 's': write_int(g, SIZE(p->literalstring)); continue; case 'a': write_data_address(g, p); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; case '$': /* insert_s, insert_v etc */ write_char(g, p->literalstring == NULL ? 'v' : 's'); continue; case 'p': if (g->options->externals_prefix) write_string(g, g->options->externals_prefix); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } /* Write out a statement with additional code to propagate a negative return * value which indicates an error. * * When generating C++, such errors throw exceptions so we don't need to * check for negative return values. 
*/ static void write_propagating_error(struct generator * g, const char * s, int keep_c, struct node *p) { if (g->options->target_lang == LANG_CPLUSPLUS) { if (keep_c) { write_block_start(g); w(g, "~Mint saved_c = z->c;~N"); } write_margin(g); writef(g, s, p); w(g, ";~N"); if (keep_c) { w(g, "~Mz->c = saved_c;~N"); write_block_end(g); } } else { write_block_start(g); if (keep_c) { w(g, "~Mint saved_c = z->c;~N"); } w(g, "~Mint ret = "); writef(g, s, p); w(g, ";~N"); if (keep_c) { w(g, "~Mz->c = saved_c;~N"); } w(g, "~Mif (ret < 0) return ret;~N"); write_block_end(g); } } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "INT_MAX"); break; case c_minint: write_string(g, "INT_MIN"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_divide: s = " / "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "z->c"); break; case c_limit: w(g, p->mode == m_forward ? 
"z->l" : "z->lb"); break; case c_len: if (g->options->encoding == ENC_UTF8) { w(g, "len_utf8(z->p)"); break; } /* FALLTHRU */ case c_size: w(g, "SIZE(z->p)"); break; case c_lenof: if (g->options->encoding == ENC_UTF8) { writef(g, "len_utf8(~V)", p); break; } /* FALLTHRU */ case c_sizeof: writef(g, "SIZE(~V)", p); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { write_block_end(g); str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); w(g, "~Mdo {~N~+"); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; g->label_used = 0; generate(g, p); w(g, "~Mbreak;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar && K_needed_node_on_f(p)) { write_restorecursor(g, p, savevar); } p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); write_block_end(g); if (str_back(g->outbuf) == '\n') { str_pop(g->outbuf); } w(g, " while (0);~N"); if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mz->lb = z->c; z->c = z->l;~N", p); generate(g, p->left); w(g, "~Mz->c = z->lb;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); g->label_used = 0; generate(g, p->left); int l = g->failure_label; int u = g->label_used; g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; writef(g, "~M~f~N", p); if (u) wsetl(g, l); if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); g->label_used = 0; if (savevar) { append_restore_string(p, g->failure_str, savevar); } generate(g, p->left); if (g->label_used) 
wsetl(g, g->failure_label); if (savevar) { write_block_end(g); str_delete(savevar); } } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); if (g->options->target_lang == LANG_CPLUSPLUS) { writef(g, "~M~V = true;~N", p); } else { writef(g, "~M~V = 1;~N", p); } } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); if (g->options->target_lang == LANG_CPLUSPLUS) { writef(g, "~M~V = false;~N", p); } else { writef(g, "~M~V = 0;~N", p); } } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); writef(g, "~M~f~N", p); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); write_propagating_error(g, "~V(z)", false, p->left); } else { int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); } if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); if (g->options->encoding == ENC_UTF8) { if (p->mode == m_forward) w(g, "~{~Mint ret = skip_utf8(z->p, z->c, z->l, 1"); else w(g, "~{~Mint ret = skip_b_utf8(z->p, z->c, z->lb, 1"); writef(g, ");~N" "~Mif (ret < 
0) ~f~N" "~Mz->c = ret;~N" "~}", p); } else { write_margin(g); write_check_limit(g, p); write_newline(g); write_margin(g); winc(g, p); write_newline(g); } } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : ""; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V, ~I0, ~I1, 1) < 0) ~f~N", p); } else { writef(g, "~{" "~Mint ret = ~S1_grouping~S0~S2(z, ~V, ~I0, ~I1, 1);~N" "~Mif (ret < 0) ~f~N", p); if (p->mode == m_forward) w(g, "~Mz->c += ret;~N"); else w(g, "~Mz->c -= ret;~N"); w(g, "~}"); } } static void generate_GO(struct generator * g, struct node * p, int is_goto) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); w(g, "~Mwhile (1) {~N~+"); struct str * savevar = NULL; if (is_goto || repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); /* include for goto; omit for gopast */ if (is_goto) write_restorecursor(g, p, savevar); w(g, "~Mbreak;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate_next(g, p); w(g, "~}"); } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); if (g->options->target_lang == LANG_C) { w(g, "~{~Mint i; for (i = "); } else { w(g, "~Mfor (int i = "); } generate_AE(g, p->AE); writef(g, "; i > 0; i--) {~N~+", p); generate(g, p->left); w(g, "~}"); if (g->options->target_lang == LANG_C) { w(g, "~}"); } } static void 
generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { writef(g, "~Mwhile (1) {~+~N", p); struct str * savevar = NULL; if (repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~{~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); writef(g, "~Mif (~B0 > 0) ~f~N", p); w(g, "~}"); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); writef(g, ") ~f~N", p); w(g, "~Mz->c = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); if (g->options->encoding == ENC_UTF8) { g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = p->mode == m_forward ? "z->l" : "z->lb"; w(g, "~{"); if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. 
g->I[0] = p->AE->number; w(g, "~Mint ret = skip~S0_utf8(z->p, z->c, ~S1, ~I0);~N"); } else { w(g, "~Mint ae = "); generate_AE(g, p->AE); w(g, ";~N"); w(g, "~Mint ret = ae >= 0 ? skip~S0_utf8(z->p, z->c, ~S1, ae) : -1;~N"); } w(g, "~Mif (ret < 0) ~f~N"); w(g, "~Mz->c = ret;~N"); w(g, "~}"); } else { // Fixed-width characters. g->S[0] = p->mode == m_forward ? "+" : "-"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. g->I[0] = p->AE->number; if (p->mode == m_forward) { writef(g, "~Mif (z->c ~S0 ~I0 > z->l) ~f~N", p); } else { writef(g, "~Mif (z->c ~S0 ~I0 < z->lb) ~f~N", p); } writef(g, "~Mz->c ~S0= ~I0;~N", p); } else { w(g, "~{~Mint ret = z->c ~S0 "); generate_AE(g, p->AE); writef(g, ";~N", p); if (p->mode == m_forward) { writef(g, "~Mif (ret > z->l || ret < z->c) ~f~N", p); } else { writef(g, "~Mif (ret < z->lb || ret > z->c) ~f~N", p); } writef(g, "~Mz->c = ret;~N" "~}", p); } } } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); write_propagating_error(g, "slice_del(z)", false, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "b"; writef(g, "~Mz->c = z->l~S0;~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~Mz->~S0 = z->c;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; writef(g, "~Mz->~S0 = z->c;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); write_propagating_error(g, "assign_to(z, &~V)", false, p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); write_propagating_error(g, "slice_to(z, &~V)", false, p); } static void generate_insert(struct generator * g, struct node * p, int style) { write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; write_propagating_error(g, "insert_~$(z, z->c, z->c, ~a)", keep_c, p); } static void generate_stringassign(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { /* like 'attach' */ write_propagating_error(g, "insert_~$(z, z->c, z->l, ~a)", true, p); } else { write_propagating_error(g, "insert_~$(z, z->lb, z->c, ~a)", false, p); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); write_propagating_error(g, "slice_from_~$(z, ~a)", false, p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); bool extra_block = false; if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); assert(q->right == NULL); g->B[0] = str_data(varname); writef(g, "~{~Mint ~B0;~N", p); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (z->c ~S0 "); generate_AE(g, q->AE); writef(g, ") ~f~N", q); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~M~B0 = z->l; z->l = "); generate_AE(g, q->AE); w(g, "; ~B0 -= z->l;~N"); } else { w(g, "~M~B0 = z->lb; z->lb = "); generate_AE(g, q->AE); w(g, ";~N"); } if (p->mode == m_forward) { str_assign(g->failure_str, "z->l += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "z->lb = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { write_block_start(g); extra_block = true; struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~{~Mint ~B0 = z->l - z->c; z->l = z->c;~N"); } else { w(g, "~{~Mint ~B0 = z->lb; z->lb = z->c;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "z->l += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "z->lb = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } str_delete(savevar); } generate(g, p->aux); write_margin(g); write_str(g, g->failure_str); w(g, "~N" "~}"); if (extra_block) { write_block_end(g); } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); // We only want to save and restore SN_env, not the variables. writef(g, "~{~Mstruct SN_env en~B0 = *z;~N", p); if (p->left->possible_signals == -1) { /* Assume failure. 
*/ w(g, "~Mint ~B0_f = 1;~N"); } writef(g, "~Mz->p = ~V;~N" "~Mz->lb = z->c = 0;~N" "~Mz->l = SIZE(z->p);~N", p); generate(g, p->left); if (p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_f = 0;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~M~V = z->p;~N" "~M*z = en~B0;~N", p); if (p->left->possible_signals == 0) { // p->left always signals f. w(g, "~M~f~N"); } else if (p->left->possible_signals == -1) { w(g, "~Mif (~B0_f) ~f~N"); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { writef(g, ";~N", p); } else { writef(g, ") ~f~N", p); } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~V(z);~N", p); p->right = NULL; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. 
*/ writef(g, "~Mreturn ~V(z);~N", p); return; } if (just_return_on_fail(g)) { write_block_start(g); writef(g, "~Mint ret = ~V(z);~N", p); if (g->options->target_lang == LANG_CPLUSPLUS) { writef(g, "~Mif (ret == 0) return ret;~N", p); } else { /* For C, we need to propagate both failures and runtime errors so * we do a combined test for better optimisation and clearer * generated code. */ writef(g, "~Mif (ret <= 0) return ret;~N", p); } write_block_end(g); } else { if (signals == 1) { /* Always succeeds - just need to handle runtime errors. */ write_propagating_error(g, "~V(z)", false, p); } else if (signals == 0) { /* Always fails. */ write_propagating_error(g, "~V(z)", false, p); writef(g, "~M~f~N", p); } else { if (g->options->target_lang == LANG_CPLUSPLUS) { writef(g, "~Mif (!~V(z)) ~f~N", p); } else { write_block_start(g); writef(g, "~Mint ret = ~V(z);~N", p); writef(g, "~Mif (ret == 0) ~f~N", p); writef(g, "~Mif (ret < 0) return ret;~N", p); write_block_end(g); } } } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : ""; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn !~S1_grouping~S0~S2(z, ~V, ~I0, ~I1, 0);~N", p); p->right = NULL; } else { writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V, ~I0, ~I1, 0)) ~f~N", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_v~S0(z, ~V);~N", p); p->right = NULL; } else { writef(g, "~Mif (!(eq_v~S0(z, ~V))) ~f~N", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); symbol * b = p->literalstring; if (SIZE(b) == 1) { /* It's quite common to compare with a single character literal string, * so just inline the simpler code for this case rather than making a * function call. In UTF-8 mode, only do this for the ASCII subset, * since multi-byte characters are more complex to test against. */ if (g->options->encoding == ENC_UTF8 && *b >= 128) { printf("single byte %d\n", *b); exit(1); } g->I[0] = *b; if (p->mode == m_forward) { writef(g, "~Mif (z->c == z->l || z->p[z->c] != ~c0) ~f~N" "~Mz->c++;~N", p); } else { writef(g, "~Mif (z->c <= z->lb || z->p[z->c - 1] != ~c0) ~f~N" "~Mz->c--;~N", p); } return; } g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_s~S0(z, ~s, ~L);~N", p); p->right = NULL; } else { writef(g, "~Mif (!(eq_s~S0(z, ~s, ~L))) ~f~N", p); } } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; write_newline(g); write_comment(g, p); if (q->type == t_routine) { write_string(g, "static "); } else if (g->options->target_lang == LANG_C) { write_string(g, "extern "); } writef(g, "int ~V(struct SN_env * z) {~N~+", p); if (q->amongvar_needed) { w(g, "~Mint among_var;~N"); } /* Declare localised variables. 
*/ for (struct name * name = g->analyser->names; name; name = name->next) { if (name->local_to == q) { switch (name->type) { case t_string: assert(0); break; case t_integer: w(g, "~Mint "); write_varname(g, name); w(g, ";~N"); break; case t_boolean: if (g->options->target_lang == LANG_CPLUSPLUS) { w(g, "~Mbool "); } else { w(g, "~Mint "); } write_varname(g, name); w(g, ";~N"); break; } } } g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->label_used = 0; /* Generate function body. */ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (p->left->possible_signals) { generate(g, p->left->right); } } w(g, "~}"); } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn 1;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = x->shortest_size; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; g->I[1] = x->literalstring_count; /* In forward mode with non-ASCII UTF-8 characters, the first byte * of the string will often be the same, so instead look at the last * common byte position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. 
*/ for (int c = 0; c < x->literalstring_count; ++c) { symbol ch; if (among_cases[c].size == 0) { empty_case = c; continue; } if (p->mode == m_forward) { ch = among_cases[c].b[shortest_size - 1]; } else { ch = among_cases[c].b[among_cases[c].size - 1]; } if (n_cases == 0) { block = ch >> 5; } else if (ch >> 5 != block) { block = -1; if (n_cases > 2) break; } if (block == -1) { if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { ++n_cases; break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { bitmap |= 1u << (ch & 0x1f); if (n_cases < 2) cases[n_cases] = ch; ++n_cases; } } } bool pre_check = (block != -1 || n_cases <= 2); if (g->options->coverage) { // Don't shortcut if generating coverage. pre_check = false; } if (pre_check) { char buf[64]; g->I[2] = block; g->I[3] = bitmap; g->I[4] = shortest_size - 1; if (p->mode == m_forward) { checked_snprintf(buf, sizeof(buf), "z->p[z->c + %d]", shortest_size - 1); g->S[1] = buf; if (shortest_size == 1) { writef(g, "~Mif (z->c >= z->l", p); } else { writef(g, "~Mif (z->c + ~I4 >= z->l", p); } } else { g->S[1] = "z->p[z->c - 1]"; if (shortest_size == 1) { writef(g, "~Mif (z->c <= z->lb", p); } else { writef(g, "~Mif (z->c - ~I4 <= z->lb", p); } } assert(n_cases > 0); if (n_cases == 1) { g->I[4] = cases[0]; writef(g, " || ~S1 != ~I4", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; writef(g, " || (~S1 != ~I4 && ~S1 != ~I5)", p); } else { writef(g, " || ~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p); } write_string(g, ") "); if (empty_case != -1 && !among_cases[empty_case].function) { /* If the among includes the ungated empty string, it can never * fail so not matching the bitmap means we match the empty string. 
*/ g->I[4] = among_cases[empty_case].result; writef(g, "among_var = ~I4; else~N", p); } else { writef(g, "~f~N", p); } } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } if (x->amongvar_needed) { writef(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1, ~F);~N", p); if (!x->always_matches) { writef(g, "~Mif (!among_var) ~f~N", p); } return; } if (pre_check && !x->function_count) { // If all cases are one symbol long (so one byte of UTF-8, one // character long in fixed-width encodings) then we don't need to call // the helper and can just inc/dec the cursor by 1. if (x->longest_size == 1 && !x->always_matches) { write_margin(g); winc(g, p); write_newline(g); // Suppress generating table for this among. x->used = false; return; } } if (x->always_matches) { writef(g, "~Mfind_among~S0(z, a_~I0, ~I1, ~F);~N", p); } else if (x->command_count == 0 && tailcallable(g, p)) { writef(g, "~Mreturn find_among~S0(z, a_~I0, ~I1, ~F) != 0;~N", p); x->node->right = NULL; } else { writef(g, "~Mif (!find_among~S0(z, a_~I0, ~I1, ~F)) ~f~N", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { writef(g, "~Mswitch (among_var) {~N~+", p); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); w(g, "~Mbreak;~N~-"); } w(g, "~}"); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. 
if (inverted) { writef(g, "~Mreturn !~V;~N", p); } else { writef(g, "~Mreturn ~V;~N", p); } p->right = NULL; return; } if (inverted) { writef(g, "~Mif (~V) ~f~N", p); } else { writef(g, "~Mif (!~V) ~f~N", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~f~N", p); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(z, ~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case 
c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } if (g->failure_label != a0) g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_head(struct generator * g) { struct options * o = g->options; if (o->cheader) { int quoted = (o->cheader[0] == '<' || o->cheader[0] == '"'); w(g, "#include "); if (!quoted) write_char(g, '<'); write_string(g, o->cheader); if (!quoted) write_char(g, '>'); write_newline(g); write_newline(g); } if (o->target_lang == LANG_CPLUSPLUS) { w(g, "#define 
SNOWBALL_RUNTIME_THROW_EXCEPTIONS~N"); } if (g->analyser->debug_used) { w(g, "#define SNOWBALL_DEBUG_COMMAND_USED~N"); } w(g, "#include \""); write_s(g, o->output_leaf); w(g, ".h\"~N~N"); if (g->analyser->int_limits_used) { w(g, "#include ~N"); } w(g, "#include ~N~N"); if (o->target_lang == LANG_CPLUSPLUS) { w(g, "~Mtypedef "); write_string(g, o->package); w(g, "::~n::SN_local SN_local;~N~N"); return; } w(g, "#include \""); if (o->runtime_path) { write_string(g, o->runtime_path); if (o->runtime_path[strlen(o->runtime_path) - 1] != '/') write_char(g, '/'); } w(g, "snowball_runtime.h\"~N~N"); if (g->analyser->variable_count > 0) { // Generate the struct SN_local definition, which embeds a struct // SN_env and also holds non-localised variables. We group variables // by type to try to produce more efficient struct packing. w(g, "struct SN_local {~N~+" "~Mstruct SN_env z;~N"); for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_integer) { w(g, "~Mint "); write_varname(g, name); w(g, ";~N"); } } for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_boolean) { if (g->options->target_lang == LANG_CPLUSPLUS) { w(g, "~Mbool "); } else { w(g, "~Munsigned char "); } write_varname(g, name); w(g, ";~N"); } } for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_string) { w(g, "~Msymbol * "); write_varname(g, name); w(g, ";~N"); } } w(g, "~-~M};~N~N"); if (g->options->target_lang == LANG_C) { w(g, "typedef struct SN_local SN_local;~N~N"); } } const char * vp = g->options->variables_prefix; if (vp) { for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: w(g, "extern const symbol * "); write_string(g, vp); write_s(g, q->s); w(g, "(struct SN_env * z) {~N~+"); w(g, "~Msymbol * p = "); write_varref(g, q); w(g, ";~N" "~Mp[SIZE(p)] = 0;~N" 
"~Mreturn p;~N~-" "}~N~N"); break; case t_integer: w(g, "extern int "); write_string(g, vp); write_s(g, q->s); w(g, "(struct SN_env * z) {~N~+" "~Mreturn "); write_varref(g, q); w(g, ";~N~-" "}~N~N"); break; case t_boolean: if (g->options->target_lang == LANG_CPLUSPLUS) { w(g, "extern bool "); } else { w(g, "extern int "); } write_string(g, vp); write_s(g, q->s); w(g, "(struct SN_env * z) {~N~+" "~Mreturn "); write_varref(g, q); w(g, ";~N~-" "}~N~N"); break; } } } } static void generate_routine_declarations(struct generator * g) { if (g->options->target_lang == LANG_C) { w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->type == t_external) { w(g, "extern int "); write_varname(g, q); w(g, "(struct SN_env * z);~N"); } } w(g, "#ifdef __cplusplus~N" "}~N" "#endif~N~N"); } if (g->analyser->name_count[t_routine]) { for (struct name * q = g->analyser->names; q; q = q->next) { if (q->type == t_routine) { w(g, "static int "); write_varname(g, q); w(g, "(struct SN_env * z);~N"); } } write_newline(g); } } static void generate_among_table(struct generator * g, struct among * x) { write_newline(g); write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; for (int i = 0; i < x->literalstring_count; i++) { if (v[i].size) { g->I[1] = i; g->I[2] = v[i].size; w(g, "static const symbol s_~I0_~I1[~I2] = "); wlitarray(g, v[i].b); w(g, ";~N"); } } g->I[1] = x->literalstring_count; if (g->options->coverage) { g->I[1] = g->I[1] * 2 + 1; } w(g, "~Mstatic const struct among a_~I0[~I1] = {~N"); for (int i = 0; i < x->literalstring_count; i++) { if (i) w(g, ",~N"); g->I[1] = i; g->I[2] = v[i].size; g->I[3] = (v[i].i >= 0 ? 
v[i].i - i : 0); g->I[4] = v[i].result; g->I[5] = v[i].function_index; if (g->options->comments) { w(g, "/*~J1 */ "); } w(g, "{ ~I2, "); if (v[i].size == 0) { w(g, "0,"); } else { w(g, "s_~I0_~I1,"); } w(g, " ~I3, ~I4, ~I5}"); } if (g->options->coverage) { w(g, ",~N"); g->S[1] = g->analyser->tokeniser->file; for (int i = 0; i < x->literalstring_count; i++) { if (g->options->comments) { w(g, "/* coverage */ "); } g->I[1] = x->b[i].line_number; g->I[2] = x->b[i].string_index; w(g, "{ ~I0, (const symbol*)\"~S1:~I1\", 0, ~I2, 0 },~N"); } if (x->always_matches) { g->I[0] = -1; } if (g->options->comments) { w(g, "/* coverage */ "); } g->I[1] = x->node->line_number; w(g, "{ ~I0, (const symbol*)\"~S1:~I1\", 0, 0, 0 },~N"); } w(g, "~N};~N"); if (x->function_count <= 1) return; w(g, "~N~Mstatic int af_~I0(struct SN_env * z) {~N~+"); w(g, "~Mswitch (z->af) {~N~+"); for (int n = 1; n <= x->function_count; n++) { w(g, "~Mcase "); write_int(g, n); w(g, ": return "); for (int i = 0; i < x->literalstring_count; i++) { if (v[i].function_index == n) { write_varref(g, v[i].function); w(g, "(z);~N"); break; } } } w(g, "~-~M}~N"); w(g, "~Mreturn -1;~N"); w(g, "~-~M}~N"); } static void generate_amongs(struct generator * g) { struct str * s = g->outbuf; g->outbuf = g->declarations; for (struct among * x = g->analyser->amongs; x; x = x->next) { if (x->used) generate_among_table(g, x); } g->outbuf = s; } static void set_bit(symbol * b, int i) { b[i >> 3] |= 1 << (i & 7); } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7) / 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); w(g, "~Nstatic const unsigned char "); write_varname(g, q->name); w(g, "[] = { "); for (int i = 0; i < size; i++) { if (i) w(g, ", "); write_int(g, map[i]); } if 
(g->options->coverage) { int grouping_number = q->name->count; if (grouping_number > 255) grouping_number = 255; w(g, ", "); wlitch(g, grouping_number); char buf[1024]; checked_snprintf(buf, sizeof(buf), "%s:%d: grouping %.*s", g->analyser->tokeniser->file, q->line_number, SIZE(q->name->s), q->name->s); for (const char * p = buf; *p; ++p) { w(g, ", "); wlitch(g, (int)*p); } w(g, ", '\\0'"); } w(g, " };~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct str * s = g->outbuf; g->outbuf = g->declarations; for (struct grouping * q = g->analyser->groupings; q; q = q->next) { generate_grouping_table(g, q); } g->outbuf = s; } static void generate_create(struct generator * g) { w(g, "~N" "extern struct SN_env * ~pcreate_env(void) {~N~+"); if (g->analyser->variable_count == 0) { w(g, "~Mreturn SN_new_env(sizeof(struct SN_env));~N"); } else { w(g, "~Mstruct SN_env * z = SN_new_env(sizeof(SN_local));~N" "~Mif (z) {~N~+"); for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to) { switch (name->type) { case t_string: w(g, "~M"); write_varref(g, name); w(g, " = NULL;~N"); break; case t_integer: w(g, "~M"); write_varref(g, name); w(g, " = 0;~N"); break; case t_boolean: w(g, "~M"); write_varref(g, name); if (g->options->target_lang == LANG_CPLUSPLUS) { w(g, " = false;~N"); } else { w(g, " = 0;~N"); } break; } } } if (g->analyser->name_count[t_string] > 0) { write_newline(g); // To simplify error handling, we initialise all strings to NULL // above, then try to allocate them in a second pass. 
for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to) { switch (name->type) { case t_string: w(g, "~Mif (("); write_varref(g, name); w(g, " = create_s()) == NULL) {~N~+" "~M~pclose_env(z);~N" "~Mreturn NULL;~N~-" "~M}~N"); break; } } } } w(g, "~-~M}~N" "~Mreturn z;~N"); } w(g, "~-}~N"); } static void generate_close(struct generator * g) { w(g, "~Nextern void ~pclose_env(struct SN_env * z) {~N~+"); if (g->analyser->name_count[t_string] > 0) { w(g, "~Mif (!z) return;~N"); for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_string) { w(g, "~Mlose_s("); write_varref(g, name); w(g, ");~N"); } } } // Note: SN_delete_env() no-ops if z is NULL so we don't to gate this call. w(g, "~MSN_delete_env(z);~N" "~-}~N~N"); } static void generate_header_file(struct generator * g) { struct options * o = g->options; if (o->hheader) { int quoted = (o->hheader[0] == '<' || o->hheader[0] == '"'); w(g, "#include "); if (!quoted) write_char(g, '<'); write_string(g, o->hheader); if (!quoted) write_char(g, '>'); write_newline(g); write_newline(g); } if (o->target_lang == LANG_CPLUSPLUS) { w(g, "#define SNOWBALL_RUNTIME_THROW_EXCEPTIONS~N" "#include \""); if (o->runtime_path) { write_string(g, o->runtime_path); if (o->runtime_path[strlen(o->runtime_path) - 1] != '/') write_char(g, '/'); } w(g, "snowball_runtime.h\"~N~N"); w(g, "namespace "); write_string(g, o->package); w(g, " {~N~N"); w(g, "class ~n : public "); write_string(g, o->parent_class_name); w(g, " {~N" " public:~N~+"); } if (o->target_lang == LANG_C) { w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N"); /* for C++ */ w(g, "~N" "extern struct SN_env * ~pcreate_env(void);~N" "extern void ~pclose_env(struct SN_env * z);~N" "~N"); } const char * vp = o->variables_prefix; if (vp) { for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: if (o->target_lang == 
LANG_CPLUSPLUS) { w(g, "~Mconst symbol * "); } else { w(g, "extern const symbol * "); } write_string(g, vp); write_s(g, q->s); if (o->target_lang == LANG_CPLUSPLUS) { w(g, "() {~N~+" "~Mstruct SN_env * z = &(zlocal.z);~N" "~Msymbol * p = "); write_varref(g, q); w(g, ";~N" "~Mp[SIZE(p)] = 0;~N" "~Mreturn p;~N~-" "~M}~N~N"); } else { w(g, "(struct SN_env * z);~N"); } break; case t_integer: if (o->target_lang == LANG_CPLUSPLUS) { w(g, "~Mint "); } else { w(g, "extern int "); } write_string(g, vp); write_s(g, q->s); if (o->target_lang == LANG_CPLUSPLUS) { w(g, "() {~N~+" "~Mstruct SN_env * z = &(zlocal.z);~N" "~Mreturn "); write_varref(g, q); w(g, ";~N~-" "~M}~N~N"); } else { w(g, "(struct SN_env * z);~N"); } break; case t_boolean: if (o->target_lang == LANG_CPLUSPLUS) { w(g, "~Mbool "); } else { w(g, "extern int "); } write_string(g, vp); write_s(g, q->s); if (o->target_lang == LANG_CPLUSPLUS) { w(g, "() {~N~+" "~Mstruct SN_env * z = &(zlocal.z);~N" "~Mreturn "); write_varref(g, q); w(g, ";~N~-" "~M}~N~N"); } else { w(g, "(struct SN_env * z);~N"); } break; } } } if (o->target_lang == LANG_C) { for (struct name * q = g->analyser->names; q; q = q->next) { if (q->type == t_external) { w(g, "extern int "); write_varname(g, q); w(g, "(struct SN_env * z);~N"); } } w(g, "~N" "#ifdef __cplusplus~N" "}~N" "#endif~N"); /* for C++ */ } if (o->target_lang == LANG_CPLUSPLUS) { // Generate the struct SN_local definition, which embeds a struct // SN_env and also holds any non-localised variables. We group // variables by type to try to produce more efficient struct packing. 
w(g, "~Mstruct SN_local {~N~+" "~Mstruct SN_env z;~N"); for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_integer) { w(g, "~Mint "); write_varname(g, name); w(g, ";~N"); } } for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_boolean) { if (g->options->target_lang == LANG_CPLUSPLUS) { w(g, "~Mbool "); } else { w(g, "~Munsigned char "); } write_varname(g, name); w(g, ";~N"); } } for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_string) { w(g, "~Msymbol * "); write_varname(g, name); w(g, ";~N"); } } w(g, "~-~M};~N" "~N"); w(g, "~- private:~N~+" "~MSN_local zlocal = {};~N" "~N" "~Mvoid close_env() {~N~+" "~Mstruct SN_env * z = &(zlocal.z);~N"); if (g->analyser->name_count[t_string] > 0) { for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_string) { w(g, "~Mlose_s("); write_varref(g, name); w(g, ");~N"); } } } w(g, "~Mlose_s(z->p);~N" "~-~M}~N~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (!q->local_to && q->type == t_external) { w(g, "~Mstatic int "); if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } write_s(g, q->s); w(g, "(struct SN_env * z);~N~N"); } } w(g, "~- public:~N~+" "~M~n() {~N~+" "~Mstruct SN_env * z = &(zlocal.z);~N" "~Mz->p = create_s();~N"); if (g->analyser->name_count[t_string] > 0) { w(g, "~Mtry {~N~+"); for (struct name * name = g->analyser->names; name; name = name->next) { if (!name->local_to && name->type == t_string) { write_margin(g); write_varref(g, name); w(g, " = create_s();~N"); } } w(g, "~-~M} catch (...) 
{~N~+" "~Mclose_env();~N" "~Mthrow;~N" "~-~M}~N"); } w(g, "~-~M}~N~N" "~M~~~n() {~N~+" "~Mclose_env();~N" "~-~M}~N~N" "~Mstd::string operator()(const std::string& word) override {~N~+" "~Mstruct SN_env* z = &(zlocal.z);~N" "~Mconst symbol* s = reinterpret_cast(word.data());~N" "~Mint s_size = word.size() > INT_MAX ? INT_MAX : word.size();~N" "~Mreplace_s(z, 0, z->l, s_size, s);~N" "~Mz->c = 0;~N" "~M"); write_string(g, o->package); write_string(g, "::"); write_s(g, o->name); write_string(g, "::"); if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } w(g, "stem(z);~N" "~Mreturn std::string(reinterpret_cast(z->p), SIZE(z->p));~N" "~-~M}~N" "~-~M};~N~N"); w(g, "}~N"); } } extern void generate_program_c(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "/* ", " */"); generate_head(g); generate_routine_declarations(g); g->declarations = g->outbuf; g->outbuf = str_new(); g->literalstring_count = 0; for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); } generate_amongs(g); generate_groupings(g); if (g->options->target_lang != LANG_CPLUSPLUS) { generate_create(g); generate_close(g); } output_str(g->options->output_src, g->declarations); str_delete(g->declarations); output_str(g->options->output_src, g->outbuf); str_clear(g->outbuf); write_start_comment(g, "/* ", " */"); generate_header_file(g); output_str(g->options->output_h, g->outbuf); str_delete(g->outbuf); str_delete(g->failure_str); } snowball-3.1.0/compiler/generator_csharp.c000066400000000000000000001227771520373054300206620ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h" /* prototype functions for recursive use: */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); /* Write routines 
for items from the syntax tree */ static void write_relop(struct generator * g, int relop) { // Relational operators are the same as C. write_c_relop(g, relop); } static void write_varname(struct generator * g, struct name * p) { if (p->type == t_external) { if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } } else { // We use the same naming scheme for both global and local variables. write_char(g, "SBIrxg"[p->type]); write_char(g, '_'); } write_s(g, p->s); } /* Reference to variable, e.g. when assigning to or using in an expression. */ static void write_varref(struct generator * g, struct name * p) { /* In C#, references look just the same */ write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_char(g, '"'); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; // Write out ASCII and lower Unicode printables as literal characters. // Use escapes for anything over 0x590 as a crude way to avoid LTR // characters affecting the rendering of source character order in // confusing ways. if ((32 <= ch && ch < 127) || (0xa0 < ch && ch < 0x590)) { if (ch == '"' || ch == '\\') write_char(g, '\\'); // Our C# generator uses ENC_WIDECHARS so we need to convert. 
write_wchar_as_utf8(g, ch); } else { write_string(g, "\\u"); write_hex4(g, ch); } } write_char(g, '"'); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p, NULL); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "limit - "; writef(g, "~Mint ~B0 = ~S1cursor;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "cursor = "); if (p->mode != m_forward) str_append_string(out, "limit - "); str_append(out, savevar); str_append_ch(out, ';'); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"cursor++;" : "cursor--;"); write_newline(g); } static void wsetl(struct generator * g, int n) { w(g, "~-~Mlab~+"); write_int(g, n); w(g, ": ; ~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "goto lab"); write_int(g, n); write_char(g, ';'); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_block_start(g); write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_char(g, ';'); g->label_used = 1; } write_newline(g); if (str_len(g->failure_str) != 0) write_block_end(g); } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "cursor >= limit", p); } else { write_failure_if(g, "cursor <= limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'F': { // Among function dispatcher. struct among * x = p->among; if (x->function_count == 0) { write_string(g, "null"); continue; } if (x->function_count == 1) { // Only one different function used in this among. 
struct amongvec * v = x->b; for (int j = 0; j < x->literalstring_count; j++) { if (v[j].function) { write_varref(g, v[j].function); goto continue_outer_loop; } } fprintf(stderr, "function_count == 1 but no among functions\n"); exit(1); continue_outer_loop: continue; } w(g, "af_"); write_int(g, x->number); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "int.MaxValue"); break; case c_minint: write_string(g, "int.MinValue"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_divide: s = " / "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; case c_len: /* Same as size() for C#. */ case c_size: w(g, "current.Length"); break; case c_lenof: /* Same as sizeof() for C#. 
*/ case c_sizeof: writef(g, "~V.Length", p); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); write_comment(g, p); if (savevar && K_needed_node_on_f(p)) { write_block_start(g); write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; g->label_used = 0; generate(g, p); wgotol(g, out_lab); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); if (savevar) { write_block_end(g); str_delete(savevar); } wsetl(g, out_lab); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mlimit_backward = cursor;~N" "~Mcursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); int l = g->failure_label; int u = g->label_used; g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_failure(g); if (u) wsetl(g, l); if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_block_end(g); 
str_delete(savevar);
    }
}

/* Generate `set`: assign true to the boolean variable p->name. */
static void generate_set(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = true;~N", p);
}

/* Generate `unset`: assign false to the boolean variable p->name. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = false;~N", p);
}

/* Generate `fail`: run the operand and then signal failure regardless. */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    write_failure(g);
}

/* generate_test() also implements 'reverse' */
/* The operand is run and, if it did not fail, the cursor is put back to where
 * it was (when K_needed() reports that the operand needs that). */
static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (savevar) {
        write_restorecursor(g, p, savevar);
        write_block_end(g);
        str_delete(savevar);
    }
}

/* Generate `do`: run the operand, ignore whether it failed, and restore the
 * cursor afterwards if it was saved.  A bare routine call is emitted as a
 * plain call whose result is discarded; anything else gets a fresh failure
 * label placed directly after it. */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);
    if (savevar) {
        write_block_start(g);
        write_savecursor(g, p, savevar);
    }

    if (p->left->type == c_call) {
        /* Optimise do <call> */
        write_comment(g, p->left);
        writef(g, "~M~V();~N", p->left);
    } else {
        int label = new_label(g);
        g->failure_label = label;
        g->label_used = 0;
        str_clear(g->failure_str);

        generate(g, p->left);

        if (g->label_used)
            wsetl(g, g->failure_label);
    }

    if (savevar) {
        write_restorecursor(g, p, savevar);
        write_block_end(g);
        str_delete(savevar);
    }
}

/* Generate `next`: fail if at the limit, otherwise move the cursor one
 * character in the current direction. */
static void generate_next(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_check_limit(g, p);
    write_inc_cursor(g, p);
}

/* Generate `goto`/`gopast` applied directly to a grouping (or, when
 * `complement` is set, to a `non` grouping).
 *
 * The generated code makes a single call to one of the runtime's
 * in_grouping/out_grouping helpers (with `_b` appended when not going
 * forward), passing `true` as the last argument.  A negative result is
 * treated as failure; for gopast (`is_goto` is 0) the result is then added to
 * (forward) or subtracted from (backward) the cursor. */
static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
    write_comment(g, p);

    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->S[1] = complement ? "in" : "out";
    g->I[0] = q->smallest_ch;
    g->I[1] = q->largest_ch;
    if (is_goto) {
        writef(g, "~Mif (~S1_grouping~S0(~V, ~I0, ~I1, true) < 0)~N~f~N", p);
    } else {
        writef(g, "~{~N"
                  "~Mint ret = ~S1_grouping~S0(~V, ~I0, ~I1, true);~N"
                  "~Mif (ret < 0)~N~f~N", p);
        if (p->mode == m_forward)
            w(g, "~Mcursor += ret;~N");
        else
            w(g, "~Mcursor -= ret;~N");
        w(g, "~}");
    }
}

/* Generate the general form of `goto` (is_goto != 0) or `gopast`
 * (is_goto == 0) as a `while (true)` loop.
 *
 * Each iteration tries the operand: on success the loop is left via `break`
 * (for goto, after first restoring the cursor); on failure the cursor is
 * restored if it was saved, then the loop fails if at the limit and
 * otherwise advances the cursor by one and goes round again. */
static void generate_GO(struct generator * g, struct node * p, int is_goto) {
    write_comment(g, p);

    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    w(g, "~Mwhile (true)~N~{");

    struct str * savevar = NULL;
    if (is_goto || repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
    str_clear(g->failure_str);
    generate(g, p->left);

    /* include for goto; omit for gopast */
    if (is_goto) write_restorecursor(g, p, savevar);
    w(g, "~Mbreak;~N");

    if (g->label_used)
        wsetl(g, g->failure_label);
    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    /* The limit check below must fail to the caller's failure handler, not
     * to the label used for the operand. */
    g->label_used = used;
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    w(g, "~}");
}

/* Generate `loop AE C` as a counted `for` loop around the operand. */
static void generate_loop(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct str * loopvar = vars_newname(g);
    g->B[0] = str_data(loopvar);
    w(g, "~Mfor (int ~B0 = ");
    generate_AE(g, p->AE);
    /* Set g->B[0] again before it is next expanded. */
    g->B[0] = str_data(loopvar);
    writef(g, "; ~B0 > 0; ~B0--)~N", p);
    write_block_start(g);

    generate(g, p->left);

    write_block_end(g);
    str_delete(loopvar);
}

/* Shared body of `repeat` and `atleast`: a `while (true)` loop which runs
 * the operand until it fails.  When `loopvar` is not NULL it names a counter
 * which is decremented after each successful iteration. */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    writef(g, "~Mwhile (true)~N~{", p);

    struct str * savevar = NULL;
    if (repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
    str_clear(g->failure_str);
    generate(g, p->left);

    if (loopvar != NULL) {
        g->B[0] =
str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); if (g->label_used) wsetl(g, g->failure_label); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~{"); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~Mint c = cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> limit" : "< limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. 
write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 || c ~S2 cursor", p); } writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Massign_to(~V);~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_to(~V);~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); write_string(g, ".ToString()"); } } static void generate_insert(struct generator * g, struct node * p, int style) { write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_stringassign(struct generator * g, struct node * p) { write_comment(g, p); int keep_c = p->mode == m_forward; /* like 'attach' */ if (keep_c) { writef(g, "~{~Mint c = cursor;~N", p); } if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { writef(g, "~Minsert(limit_backward, cursor, ", p); } 
generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~Mcursor = c;~N~}"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mslice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit;~N"); w(g, "~Mlimit = "); generate_AE(g, q->AE); w(g, ";~N"); w(g, "~M~B0 -= limit;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = "); generate_AE(g, q->AE); w(g, ";~N"); } if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = cursor;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); 
str_append_ch(g->failure_str, ';'); } str_delete(savevar); } generate(g, p->aux); write_margin(g); write_str(g, g->failure_str); write_newline(g); str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~MEnv ~B0 = new Env(this);~N" "~Mcurrent = ~V;~N" "~Mcursor = 0;~N" "~Mlimit = current.Length;~N", p); if (p->left->possible_signals == -1) { /* Assume failure. */ w(g, "~Mbool ~B0_f = true;~N"); } generate(g, p->left); if (p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_f = false;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~Mcopy_from(~B0);~N", p); if (p->left->possible_signals == 0) { // p->left always signals f. w(g, "~M~f~N"); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_f", p); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. 
relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); } else { w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~V();~N", p); p->right = NULL; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~Mreturn ~V();~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V();~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V();~N", p); write_failure(g); } else { writef(g, "~Mif (!~V())~N~+", p); write_failure(g); w(g, "~-"); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn (~S1_grouping~S0(~V, ~I0, ~I1, false) == 0);~N", p); p->right = NULL; } else { writef(g, "~Mif (~S1_grouping~S0(~V, ~I0, ~I1, false) != 0)~N~f", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_s~S0(~V);~N", p); p->right = NULL; } else { write_failure_if(g, "!(eq_s~S0(~V))", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_s~S0(~L);~N", p); p->right = NULL; } else { write_failure_if(g, "!(eq_s~S0(~L))", p); } } static void generate_define(struct generator * g, struct node * p) { write_newline(g); write_comment(g, p); struct name * q = p->name; if (q->type == t_routine) { g->S[0] = "private"; } else { if (SIZE(q->s) == 4 && memcmp(q->s, "stem", 4) == 0) { g->S[0] = "protected override"; } else { g->S[0] = "protected"; } } writef(g, "~M~S0 bool ~V()~N~M{~+~N", p); g->next_label = 0; g->var_number = 0; if (q->amongvar_needed) { w(g, "~Mint among_var;~N"); } /* Declare localised variables. */ for (struct name * name = g->analyser->names; name; name = name->next) { if (name->local_to == q) { switch (name->type) { case t_string: // String variables not localised for C# currently. w(g, "~M"); write_string(g, g->options->string_class); write_char(g, ' '); write_varname(g, name); write_string(g, " = new "); write_string(g, g->options->string_class); w(g, "();~N"); break; case t_integer: w(g, "~Mint "); write_varname(g, name); w(g, ";~N"); break; case t_boolean: w(g, "~Mbool "); write_varname(g, name); w(g, ";~N"); break; } } } str_clear(g->failure_str); g->failure_label = x_return; g->label_used = 0; /* Generate function body. */ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (p->left->possible_signals) { generate(g, p->left->right); } } w(g, "~}"); } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = find_among~S0(a_~I0, ~F);~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Mfind_among~S0(a_~I0, ~F);~N", p); } else if (x->command_count == 0 && tailcallable(g, p)) { writef(g, "~Mreturn find_among~S0(a_~I0, ~F) != 0;~N", p); x->node->right = NULL; } else { write_failure_if(g, "find_among~S0(a_~I0, ~F) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch (among_var) {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; /* Put a block around each case which seems to workaround bogus * C# compiler errors (typically with repeat reports at the same * location): * * dutchStemmer.generated.cs(543,25): error CS0165: Use of unassigned local variable `c5' * * The c5 variable is initialised at point of declaration and we * don't `goto` into the block it is declared in from outside so * this seems to be buggy code flow analysis in the C# compiler. * * Unclear where to usefully report mono bugs in 2025 so I've * not tried. */ w(g, "~Mcase ~I0: {~N~+"); generate(g, x->commands[i - 1]); w(g, "~Mbreak;~N~-~M}~N"); } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. 
if (inverted) {
            writef(g, "~Mreturn !~V;~N", p);
        } else {
            writef(g, "~Mreturn ~V;~N", p);
        }
        /* Unlink whatever follows this node. */
        p->right = NULL;
        return;
    }
    if (inverted) {
        write_failure_if(g, "~V", p);
    } else {
        write_failure_if(g, "!~V", p);
    }
}

/* Generate `false`: signal failure unconditionally. */
static void generate_false(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_failure(g);
}

/* Generate `?`: emit a debug() call carrying a running count (taken from
 * g->debug_count, which is incremented) and the source line number. */
static void generate_debug(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->I[0] = g->debug_count++;
    g->I[1] = p->line_number;
    writef(g, "~Mdebug(~I0, ~I1);~N", p);
}

/* Generate code for node `p` by dispatching on its type.
 *
 * The failure state (g->label_used, g->failure_label, g->failure_str) is
 * captured on entry and put back on exit, so a handler may install its own
 * failure label without affecting its caller.  g->label_used is only reset
 * to the entry value when the handler left a different failure label
 * installed.  An unknown node type is reported on stderr and the process
 * exits with status 1. */
static void generate(struct generator * g, struct node * p) {
    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    switch (p->type) {
        case c_define:        generate_define(g, p); break;
        case c_bra:           generate_bra(g, p); break;
        case c_and:           generate_and(g, p); break;
        case c_or:            generate_or(g, p); break;
        case c_backwards:     generate_backwards(g, p); break;
        case c_not:           generate_not(g, p); break;
        case c_set:           generate_set(g, p); break;
        case c_unset:         generate_unset(g, p); break;
        case c_try:           generate_try(g, p); break;
        case c_fail:          generate_fail(g, p); break;
        case c_reverse:
        case c_test:          generate_test(g, p); break;
        case c_do:            generate_do(g, p); break;
        /* For the GO family the third argument is is_goto and, for the
         * grouping forms, the fourth is complement. */
        case c_goto:          generate_GO(g, p, 1); break;
        case c_gopast:        generate_GO(g, p, 0); break;
        case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break;
        case c_gopast_grouping:
                              generate_GO_grouping(g, p, 0, 0); break;
        case c_goto_non:      generate_GO_grouping(g, p, 1, 1); break;
        case c_gopast_non:    generate_GO_grouping(g, p, 0, 1); break;
        case c_repeat:        generate_repeat(g, p); break;
        case c_loop:          generate_loop(g, p); break;
        case c_atleast:       generate_atleast(g, p); break;
        case c_tomark:        generate_tomark(g, p); break;
        case c_hop:           generate_hop(g, p); break;
        case c_delete:        generate_delete(g, p); break;
        case c_next:          generate_next(g, p); break;
        case c_tolimit:       generate_tolimit(g, p); break;
        case c_leftslice:     generate_leftslice(g, p); break;
        case c_rightslice:    generate_rightslice(g, p); break;
        case c_assignto:      generate_assignto(g, p); break;
        case c_sliceto:       generate_sliceto(g, p); break;
        case c_stringassign:  generate_stringassign(g, p); break;
        case c_insert:
        case c_attach:        generate_insert(g, p, p->type); break;
        case c_slicefrom:     generate_slicefrom(g, p); break;
        case c_setlimit:      generate_setlimit(g, p); break;
        case c_dollar:        generate_dollar(g, p); break;
        case c_assign:        generate_integer_assign(g, p, "="); break;
        case c_plusassign:    generate_integer_assign(g, p, "+="); break;
        case c_minusassign:   generate_integer_assign(g, p, "-="); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
        case c_divideassign:  generate_integer_assign(g, p, "/="); break;
        case c_eq:
        case c_ne:
        case c_gt:
        case c_ge:
        case c_lt:
        case c_le:
            generate_integer_test(g, p);
            break;
        case c_call:          generate_call(g, p); break;
        case c_grouping:      generate_grouping(g, p, false); break;
        case c_non:           generate_grouping(g, p, true); break;
        case c_name:          generate_namedstring(g, p); break;
        case c_literalstring: generate_literalstring(g, p); break;
        case c_among:         generate_among(g, p); break;
        case c_substring:     generate_substring(g, p); break;
        case c_booltest:      generate_booltest(g, p, false); break;
        case c_not_booltest:  generate_booltest(g, p, true); break;
        case c_false:         generate_false(g, p); break;
        /* `true` needs no code at all. */
        case c_true:          break;
        case c_debug:         generate_debug(g, p); break;
        case c_functionend:   generate_functionend(g, p); break;
        default: fprintf(stderr, "%d encountered\n", p->type);
                 exit(1);
    }

    if (g->failure_label != a0)
        g->label_used = used;
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
}

/* Emit everything which precedes the class members: the warning pragmas,
 * the namespace, the using directives, the documentation comment, the
 * GeneratedCode attribute and the opening of the class declaration. */
static void generate_class_begin(struct generator * g) {
    w(g, "#pragma warning disable 0164~N");
    w(g, "#pragma warning disable 0162~N~N");

    w(g, "~Mnamespace ");
    write_string(g, g->options->package);
    w(g, "~N~{");

    w(g, "~Musing System;~N");
    w(g, "~Musing System.Text;~N");
    w(g, "~M~N");

    w(g, "~M///~N");
    w(g, "~M/// This class implements the stemming algorithm defined by a snowball script.~N");
    w(g, "~M/// ");
write_generated_comment_content(g);
    w(g, "~N"
         "~M///~N");
    w(g, "~M/// ~N");
    /* SNOWBALL_VERSION is pasted into the attribute at compile time by
     * string literal concatenation. */
    w(g, "~M[System.CodeDom.Compiler.GeneratedCode(\"Snowball\", \"" SNOWBALL_VERSION "\")]~N");
    w(g, "~Mpublic partial class ~n : ");
    write_string(g, g->options->parent_class_name);
    w(g, "~N~{");
}

/* Close the class and the namespace opened by generate_class_begin(). */
static void generate_class_end(struct generator * g) {
    w(g, "~N");
    w(g, "~}");
    w(g, "~}");
    w(g, "~N");
}

/* Emit the data for one among: the static array a_<number> of Among entries
 * and, when the among uses more than one distinct function, a dispatcher
 * method af_<number>() which switches on `af` to call the right one. */
static void generate_among_table(struct generator * g, struct among * x) {
    write_newline(g);
    write_comment(g, x->node);

    struct amongvec * v = x->b;

    g->I[0] = x->number;

    w(g, "~Mprivate static readonly Among[] a_~I0 = new[] ~N~M{~N~+");

    for (int i = 0; i < x->literalstring_count; i++) {
        /* Separator before every entry except the first. */
        if (i) w(g, ",~N");
        /* Slots 3..5 are used here so that g->I[0] still holds the among
         * number when the dispatcher is named below. */
        g->I[3] = v[i].i;
        g->I[4] = v[i].result;
        g->I[5] = v[i].function_index;

        w(g, "~Mnew Among(");
        write_literal_string(g, v[i].b);
        w(g, ", ~I3, ~I4, ~I5)");
    }
    w(g, "~N~-~M};~N");

    /* With at most one function no dispatcher is needed. */
    if (x->function_count <= 1) return;

    w(g, "~N~Mprivate bool af_~I0() {~N~+");
    w(g, "~Mswitch (af) {~N~+");
    for (int n = 1; n <= x->function_count; n++) {
        w(g, "~Mcase ");
        write_int(g, n);
        w(g, ": return ");
        /* The first entry with this function index names the function. */
        for (int i = 0; i < x->literalstring_count; i++) {
            if (v[i].function_index == n) {
                write_varref(g, v[i].function);
                w(g, "();~N");
                break;
            }
        }
    }
    w(g, "~-~M}~N");
    w(g, "~Mreturn false;~N");
    w(g, "~-~M}~N");
}

/* Emit the tables for every among in the program, then a blank line. */
static void generate_amongs(struct generator * g) {
    for (struct among * x = g->analyser->amongs; x; x = x->next) {
        generate_among_table(g, x);
    }
    w(g, "~N");
}

/* Emit one grouping as a private string constant holding q->b. */
static void generate_grouping_table(struct generator * g, struct grouping * q) {
    symbol * b = q->b;

    w(g, "~Mprivate const string ");
    write_varname(g, q->name);
    write_string(g, " = ");
    write_literal_string(g, b);
    w(g, ";~N");
}

/* Emit the constants for every grouping in the program. */
static void generate_groupings(struct generator * g) {
    for (struct grouping * q = g->analyser->groupings; q; q = q->next) {
        generate_grouping_table(g, q);
    }
}

/* Emit a private field for every string, integer and boolean variable which
 * is not local to a routine, followed by a blank line if any were written. */
static void generate_members(struct generator * g) {
    bool wrote_members = false;

    for (struct name * q = g->analyser->names; q; q = q->next) {
        /* Localised variables are declared inside their routine instead. */
        if (q->local_to) continue;
        switch
(q->type) {
            case t_string:
                /* String members are constructed where they are declared. */
                w(g, "~Mprivate ");
                write_string(g, g->options->string_class);
                write_char(g, ' ');
                write_varname(g, q);
                write_string(g, " = new ");
                write_string(g, g->options->string_class);
                w(g, "();~N");
                wrote_members = true;
                break;
            case t_integer:
                w(g, "~Mprivate int ");
                write_varname(g, q);
                w(g, ";~N");
                wrote_members = true;
                break;
            case t_boolean:
                w(g, "~Mprivate bool ");
                write_varname(g, q);
                w(g, ";~N");
                wrote_members = true;
                break;
        }
    }
    if (wrote_members) w(g, "~N");
}

/* Generate every top-level node of the program in order. */
static void generate_methods(struct generator * g) {
    for (struct node * p = g->analyser->program; p; p = p->right) {
        generate(g, p);
    }
}

/* Entry point of the C# backend: build the whole generated source in
 * g->outbuf, write it to g->options->output_src and free the buffers
 * allocated here. */
extern void generate_program_csharp(struct generator * g) {
    /* Prefix for the names handed out by vars_newname() - presumably; that
     * function is defined elsewhere. */
    g->varname_prefix = "c";

    g->outbuf = str_new();
    g->failure_str = str_new();

    write_start_comment(g, "// ", NULL);
    generate_class_begin(g);

    generate_members(g);
    generate_groupings(g);
    generate_amongs(g);
    generate_methods(g);

    generate_class_end(g);

    output_str(g->options->output_src, g->outbuf);
    str_delete(g->failure_str);
    str_delete(g->outbuf);
}
/* NOTE(review): the next line looks like tar archive header residue for
 * compiler/generator_dart.c fused with that file's #include lines, whose
 * header names appear to have been lost in extraction - left untouched. */
snowball-3.1.0/compiler/generator_dart.c000066400000000000000000001231721520373054300203220ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h"

/* prototype functions for recursive use: */

static void generate(struct generator * g, struct node * p);
static void w(struct generator * g, const char * s);
static void writef(struct generator * g, const char * s, struct node * p);

/* Write routines for items from the syntax tree */

/* Write the Dart operator for the relational operator `relop`. */
static void write_relop(struct generator * g, int relop) {
    // Relational operators are the same as C.
    write_c_relop(g, relop);
}

/* Write the Dart identifier for the Snowball name `p`: externals get the
 * optional externals prefix, everything else gets a one-letter prefix chosen
 * by its type followed by an underscore. */
static void write_varname(struct generator * g, struct name * p) {
    if (p->type == t_external) {
        if (g->options->externals_prefix) {
            write_string(g, g->options->externals_prefix);
        }
    } else {
        // We use the same naming scheme for both global and local variables.
write_char(g, "SBIrxg"[p->type]); write_char(g, '_'); } write_s(g, p->s); } /* Reference to variable, e.g. when assigning to or using in an expression. */ static void write_varref(struct generator * g, struct name * p) { /* In Dart, references look just the same */ write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_char(g, '"'); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '"' || ch == '\\' || ch == '$') write_char(g, '\\'); write_char(g, ch); } else { write_string(g, "\\u"); write_hex4(g, ch); } } write_char(g, '"'); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p, NULL); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "limit - "; writef(g, "~Mint ~B0 = ~S1cursor;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "cursor = "); if (p->mode != m_forward) str_append_string(out, "limit - "); str_append(out, savevar); str_append_ch(out, ';'); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"cursor++;" : "cursor--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { w(g, "~Mlab"); write_int(g, n); w(g, ": while (true) {~+~N"); } static void wsetlab_end(struct generator * g, int n) { if (!g->unreachable) { w(g, "~Mbreak lab"); write_int(g, n); w(g, ";~N"); } else { g->unreachable = false; } w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_char(g, ';'); write_newline(g); g->unreachable = true; } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_char(g, ';'); } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "cursor >= limit", p); } else { write_failure_if(g, "cursor <= limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'F': { struct among * x = NULL; if (p) x = p->among; if (x == NULL) { write_string(g, "null"); continue; } if (x->function_count == 0) { write_string(g, "null"); continue; } if (x->function_count == 1) { struct amongvec * v = x->b; int found = 0; for (int j = 0; j < x->literalstring_count; j++) { if (v[j].function) { write_varref(g, v[j].function); found = 1; break; } } if (!found) { fprintf(stderr, "function_count == 1 but no among functions\n"); exit(1); } continue; } w(g, "af_"); write_int(g, x->number); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence 
~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "9007199254740992"); break; case c_minint: write_string(g, "-9007199254740992"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_divide: s = " ~/ "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; case c_len: /* Same as size() for Dart. */ case c_size: w(g, "current.length"); break; case c_lenof: /* Same as sizeof() for Dart. 
*/
        case c_sizeof:
            writef(g, "~V.length", p); break;
    }
}

/* Generate each command of a bracketed sequence in order. */
static void generate_bra(struct generator * g, struct node * p) {
    write_comment(g, p);
    p = p->left;
    while (p) {
        generate(g, p);
        p = p->right;
    }
}

/* Generate "and": each sub-command is generated in turn; when
 * K_needed_for_and() says so the cursor is saved first and restored between
 * sub-commands (not after the last one). */
static void generate_and(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_and(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    while (p) {
        generate(g, p);
        /* Nothing after an unreachable point needs generating. */
        if (g->unreachable) break;
        if (savevar && p->right != NULL) write_restorecursor(g, p, savevar);
        p = p->right;
    }

    if (savevar) {
        str_delete(savevar);
    }
}

/* Generate "or": every alternative except the last runs inside its own
 * labelled loop which is the failure target for that alternative; success
 * breaks out to out_lab.  The last alternative fails to the enclosing
 * failure target. */
static void generate_or(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_or(p->left)) {
        savevar = vars_newname(g);
    }

    /* Saved so the enclosing failure target can be reinstated for the last
     * alternative. */
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int out_lab = new_label(g);
    bool end_unreachable = true;

    write_comment(g, p);
    wsetlab_begin(g, out_lab);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    str_clear(g->failure_str);

    if (p == NULL) {
        /* p should never be NULL after an or: there should be at least two
         * sub nodes.
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mlimit_backward = cursor;~N" "~Mcursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g, g->failure_label); 
g->unreachable = false;

    if (savevar) {
        str_delete(savevar);
    }
}

/* Generate "set": assign true to the boolean. */
static void generate_set(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = true;~N", p);
}

/* Generate "unset": assign false to the boolean. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = false;~N", p);
}

/* Generate "fail": run the sub-command, then fail if still reachable. */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    if (!g->unreachable) write_failure(g);
}

/* generate_test() also implements 'reverse' */
static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (savevar) {
        /* Only restore the cursor if execution can get here. */
        if (!g->unreachable) {
            write_restorecursor(g, p, savevar);
        }
        str_delete(savevar);
    }
}

/* Generate "do": run the sub-command ignoring failure, restoring the cursor
 * afterwards when K_needed() says so. */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);
    if (savevar) write_savecursor(g, p, savevar);

    if (p->left->type == c_call) {
        /* Optimise do <call> */
        write_comment(g, p->left);
        writef(g, "~M~V();~N", p->left);
    } else {
        int label = new_label(g);
        g->failure_label = label;
        str_clear(g->failure_str);

        wsetlab_begin(g, g->failure_label);
        generate(g, p->left);
        wsetlab_end(g, g->failure_label);
        g->unreachable = false;
    }

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }
}

/* Generate "next": fail at the limit, otherwise move the cursor one place. */
static void generate_next(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_check_limit(g, p);
    write_inc_cursor(g, p);
}

/* Generate goto/gopast over a grouping via the go_in/go_out grouping
 * helpers; for gopast (is_goto == 0) the cursor is then moved one further
 * place.  Sets g->S[0], g->S[1], g->I[0] and g->I[1]. */
static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
    write_comment(g, p);

    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->S[1] = complement ? "in" : "out";
    g->I[0] = q->smallest_ch;
    g->I[1] = q->largest_ch;
    write_failure_if(g, "!go_~S1_grouping~S0(~V, ~I0, ~I1)", p);
    if (!is_goto) {
        if (p->mode == m_forward)
            w(g, "~Mcursor++;~N");
        else
            w(g, "~Mcursor--;~N");
    }
}

/* Generate general goto (is_goto != 0) / gopast (is_goto == 0): a labelled
 * "golab<n>" loop which tries the sub-command and, on failure, restores the
 * cursor (when one was saved), checks the limit and advances one place. */
static void generate_GO(struct generator * g, struct node * p, int is_goto) {
    write_comment(g, p);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    bool end_unreachable = false;

    int golab = new_label(g);
    g->I[0] = golab;
    w(g, "~Mgolab~I0: while(true)~N");
    w(g, "~{");

    struct str * savevar = NULL;
    if (is_goto || repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    str_clear(g->failure_str);
    wsetlab_begin(g, g->failure_label);
    generate(g, p->left);

    if (g->unreachable) {
        /* Cannot break out of this loop: therefore the code after the
         * end of the loop is unreachable.*/
        end_unreachable = true;
    } else {
        /* include for goto; omit for gopast */
        if (is_goto) write_restorecursor(g, p, savevar);
        /* g->I[0] may have been overwritten while generating p->left. */
        g->I[0] = golab;
        w(g, "~Mbreak golab~I0;~N");
    }
    g->unreachable = false;
    wsetlab_end(g, g->failure_label);
    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    write_block_end(g);
    g->unreachable = end_unreachable;
}

/* Generate "loop": a counted for-loop around the sub-command, counting a
 * fresh variable down from the value of p->AE. */
static void generate_loop(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct str * loopvar = vars_newname(g);
    g->B[0] = str_data(loopvar);
    w(g, "~Mfor (int ~B0 = ");
    generate_AE(g, p->AE);
    /* generate_AE() may have changed g->B[0], so set it again. */
    g->B[0] = str_data(loopvar);
    writef(g, "; ~B0 > 0; ~B0--)~N", p);
    writef(g, "~{", p);

    generate(g, p->left);

    w(g, "~}");
    str_delete(loopvar);
    g->unreachable = false;
}

/* Shared body of "repeat" and "atleast": a labelled "replab<n>" loop which
 * continues while the sub-command succeeds; loopvar (may be NULL) is
 * decremented on each success. */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    int replab = new_label(g);
    w(g, "~Mreplab");
    write_int(g, replab);
    writef(g, ": while(true)~N~{", p);

    struct str *
savevar = NULL;
    if (repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    str_clear(g->failure_str);
    wsetlab_begin(g, g->failure_label);
    generate(g, p->left);

    if (!g->unreachable) {
        if (loopvar != NULL) {
            g->B[0] = str_data(loopvar);
            w(g, "~M~B0--;~N");
        }

        /* The sub-command succeeded: go round again. */
        w(g, "~Mcontinue replab");
        write_int(g, replab);
        w(g, ";~N");
        g->unreachable = true;
    }

    wsetlab_end(g, g->failure_label);
    g->unreachable = false;

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    /* The sub-command failed: leave the loop. */
    w(g, "~Mbreak replab");
    write_int(g, replab);
    w(g, ";~N~}");
}

/* Generate "repeat". */
static void generate_repeat(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate_repeat_or_atleast(g, p, NULL);
}

/* Generate "atleast": a counter initialised from p->AE is decremented on
 * each successful repetition; the command fails if it is still positive
 * once the repetition stops. */
static void generate_atleast(struct generator * g, struct node * p) {
    write_comment(g, p);
    w(g, "~{");

    struct str * loopvar = vars_newname(g);
    g->B[0] = str_data(loopvar);
    w(g, "~Mint ~B0 = ");
    generate_AE(g, p->AE);
    w(g, ";~N");
    {
        /* Preserve the failure target across the loop body. */
        int a0 = g->failure_label;
        struct str * a1 = str_copy(g->failure_str);

        generate_repeat_or_atleast(g, p, loopvar);

        g->failure_label = a0;
        str_delete(g->failure_str);
        g->failure_str = a1;
    }
    g->B[0] = str_data(loopvar);
    write_failure_if(g, "~B0 > 0", p);
    w(g, "~}");
    str_delete(loopvar);
}

/* Generate "tomark": fail if the cursor is already past the mark given by
 * p->AE (in the direction of travel), otherwise move the cursor to it. */
static void generate_tomark(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? ">" : "<";

    w(g, "~Mif (cursor ~S0 ");
    generate_AE(g, p->AE);
    w(g, ")~N");
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
    g->unreachable = false;
    w(g, "~Mcursor = ");
    generate_AE(g, p->AE);
    writef(g, ";~N", p);
}

/* Generate "hop": compute the target position in a local "c", fail if it
 * lies beyond the limit (or, for a non-constant distance, behind the
 * cursor), otherwise move the cursor there. */
static void generate_hop(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "+" : "-";

    w(g, "~{~Mint c = cursor ~S0 ");
    generate_AE(g, p->AE);
    w(g, ";~N");

    g->S[1] = p->mode == m_forward ? "> limit" : "< limit_backward";
    g->S[2] = p->mode == m_forward ? "<" : ">";
    if (p->AE->type == c_number) {
        // Constant distance hop.
// // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 || c ~S2 cursor", p); } writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = assign_to();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = slice_to();~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_stringassign(struct generator * g, struct node * p) { write_comment(g, p); int keep_c = p->mode == m_forward; /* like 'attach' */ if (keep_c) { writef(g, "~{~Mint c = cursor;~N", p); } if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { 
writef(g, "~Minsert(limit_backward, cursor, ", p);
    }
    generate_address(g, p);
    writef(g, ");~N", p);
    if (keep_c) {
        w(g, "~Mcursor = c;~N~}");
    }
}

/* Generate "<-": emit slice_from() with the string operand of p. */
static void generate_slicefrom(struct generator * g, struct node * p) {
    write_comment(g, p);
    w(g, "~Mslice_from(");
    generate_address(g, p);
    writef(g, ");~N", p);
}

/* Generate "setlimit": run p->left to find the new limit, narrow limit or
 * limit_backward to it while p->aux runs, and arrange (via failure_str and
 * the final write) for the old limit to be reinstated on both failure and
 * success. */
static void generate_setlimit(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct str * varname = vars_newname(g);
    if (p->left && p->left->type == c_tomark) {
        /* Special case for:
         *
         *   setlimit tomark AE for C
         *
         * All uses of setlimit in the current stemmers we ship follow this
         * pattern, and by special-casing we can avoid having to save and
         * restore c.
         */
        struct node * q = p->left;
        write_comment(g, q);
        g->S[0] = q->mode == m_forward ? ">" : "<";
        w(g, "~Mif (cursor ~S0 ");
        generate_AE(g, q->AE);
        w(g, ")~N");
        write_block_start(g);
        write_failure(g);
        write_block_end(g);
        g->unreachable = false;

        g->B[0] = str_data(varname);
        if (p->mode == m_forward) {
            /* Forward: the variable holds old limit minus new limit. */
            w(g, "~Mint ~B0 = limit;~N");
            w(g, "~Mlimit = ");
            generate_AE(g, q->AE);
            w(g, ";~N");
            w(g, "~M~B0 -= limit;~N");
        } else {
            /* Backward: the variable holds the old limit_backward. */
            w(g, "~Mint ~B0 = limit_backward;~N");
            w(g, "~Mlimit_backward = ");
            generate_AE(g, q->AE);
            w(g, ";~N");
        }

        if (p->mode == m_forward) {
            str_assign(g->failure_str, "limit += ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        } else {
            str_assign(g->failure_str, "limit_backward = ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        }
    } else {
        /* General case: save the cursor, run p->left to move it, use the
         * resulting cursor as the new limit and then restore the cursor. */
        struct str * savevar = vars_newname(g);
        write_savecursor(g, p, savevar);

        generate(g, p->left);

        if (!g->unreachable) {
            g->B[0] = str_data(varname);
            if (p->mode == m_forward) {
                w(g, "~Mint ~B0 = limit - cursor;~N");
                w(g, "~Mlimit = cursor;~N");
            } else {
                w(g, "~Mint ~B0 = limit_backward;~N");
                w(g, "~Mlimit_backward = cursor;~N");
            }
            write_restorecursor(g, p, savevar);

            if (p->mode == m_forward) {
                str_assign(g->failure_str, "limit += ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str,
';');
            } else {
                str_assign(g->failure_str, "limit_backward = ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            }
        }
        str_delete(savevar);
    }

    if (!g->unreachable) {
        generate(g, p->aux);

        if (!g->unreachable) {
            /* On success, reinstate the limit with the same statement used
             * on failure. */
            write_margin(g);
            write_str(g, g->failure_str);
            write_newline(g);
        }
    }
    str_delete(varname);
}

/* dollar sets snowball up to operate on a string variable as if it were the
 * current string */
static void generate_dollar(struct generator * g, struct node * p) {
    write_comment(g, p);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int label = new_label(g);
    g->failure_label = label;
    str_clear(g->failure_str);

    struct str * savevar = vars_newname(g);
    g->B[0] = str_data(savevar);
    /* Snapshot the stemmer state, then point it at the string variable. */
    writef(g, "~{~N"
              "~Mfinal SnowballProgram ~B0 = SnowballProgram.from(this);~N", p);
    writef(g, "~Mcurrent = ~V;~N"
              "~Mcursor = 0;~N"
              "~Mlimit = current.length;~N", p);

    if (p->left->possible_signals == -1) {
        /* Assume failure. */
        w(g, "~Mbool ~B0_f = true;~N");
    }

    wsetlab_begin(g, g->failure_label);

    generate(g, p->left);

    if (!g->unreachable && p->left->possible_signals == -1) {
        /* Mark success. */
        g->B[0] = str_data(savevar);
        w(g, "~M~B0_f = false;~N");
    }

    wsetlab_end(g, g->failure_label);

    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    g->B[0] = str_data(savevar);
    /* Store the result back into the variable and reinstate the snapshot. */
    writef(g, "~M~V = current;~N"
              "~Mcopy_from(~B0);~N", p);

    if (p->left->possible_signals == 0) {
        // p->left always signals f.
write_failure(g); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_f", p); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); } else { w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~V();~N", p); p->right = NULL; g->unreachable = true; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~Mreturn ~V();~N", p); g->unreachable = true; return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V();~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V();~N", p); write_failure(g); } else { write_failure_if(g, "!~V()", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"out" : "in"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn ~S1_grouping~S0(~V, ~I0, ~I1);~N", p); p->right = NULL; } else { write_failure_if(g, "!(~S1_grouping~S0(~V, ~I0, ~I1))", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_s~S0(~V);~N", p); p->right = NULL; } else { write_failure_if(g, "!(eq_s~S0(~V))", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_s~S0(~L);~N", p); p->right = NULL; } else { write_failure_if(g, "!(eq_s~S0(~L))", p); } } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; write_newline(g); write_comment(g, p); if (q->type != t_routine) { if (SIZE(q->s) == 4 && memcmp(q->s, "stem", 4) == 0) { w(g, "~M@override~N"); } } writef(g, "~Mbool ~V() {~+~N", p); g->next_label = 0; g->var_number = 0; if (q->amongvar_needed) { w(g, "~Mint among_var;~N"); } /* Declare localised variables. */ for (struct name * name = g->analyser->names; name; name = name->next) { if (name->local_to == q) { switch (name->type) { case t_string: w(g, "~MString "); write_varname(g, name); w(g, ";~N"); break; case t_integer: w(g, "~Mint "); write_varname(g, name); w(g, ";~N"); break; case t_boolean: w(g, "~Mbool "); write_varname(g, name); w(g, ";~N"); break; } } } str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; /* Generate function body. 
*/
    generate(g, p->left);
    if (p->left->right) {
        assert(p->left->right->type == c_functionend);
        /* The function end is only generated when possible_signals of the
         * body is non-zero. */
        if (p->left->possible_signals) {
            generate(g, p->left->right);
        }
    }
    w(g, "~}");
}

/* Generate the end of a function body: "return true;". */
static void generate_functionend(struct generator * g, struct node * p) {
    (void)p;
    w(g, "~Mreturn true;~N");
}

/* Generate the find_among call for a substring/among, choosing the form
 * from whether among_var is needed, whether the among always matches, and
 * whether the call is in tail position. */
static void generate_substring(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct among * x = p->among;

    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->I[0] = x->number;

    if (x->amongvar_needed) {
        writef(g, "~Mamong_var = find_among~S0(a_~I0, ~F);~N", p);
        if (!x->always_matches) {
            write_failure_if(g, "among_var == 0", p);
        }
    } else if (x->always_matches) {
        writef(g, "~Mfind_among~S0(a_~I0, ~F);~N", p);
    } else if (x->command_count == 0 && tailcallable(g, p)) {
        writef(g, "~Mreturn find_among~S0(a_~I0, ~F) != 0;~N", p);
        x->node->right = NULL;
        g->unreachable = true;
    } else {
        write_failure_if(g, "find_among~S0(a_~I0, ~F) == 0", p);
    }
}

/* Generate an among: the lookup (unless a separate substring command
 * already did it), then either the single command or a switch on among_var
 * over the commands. */
static void generate_among(struct generator * g, struct node * p) {
    struct among * x = p->among;

    if (x->substring == NULL) {
        generate_substring(g, p);
    } else {
        write_comment(g, p);
    }

    if (x->command_count == 1 && x->nocommand_count == 0) {
        /* Only one outcome ("no match" already handled). */
        generate(g, x->commands[0]);
    } else if (x->command_count > 0) {
        w(g, "~Mswitch (among_var) {~N~+");
        for (int i = 1; i <= x->command_count; i++) {
            g->I[0] = i;
            w(g, "~Mcase ~I0:~N~+");
            generate(g, x->commands[i - 1]);
            if (!g->unreachable) w(g, "~Mbreak;~N");
            w(g, "~-");
            g->unreachable = false;
        }
        write_block_end(g);
    }
}

/* Generate a test of a boolean variable (inverted != 0 tests that it is
 * false). */
static void generate_booltest(struct generator * g, struct node * p, int inverted) {
    write_comment(g, p);
    if (tailcallable(g, p)) {
        // Optimise at end of function.
if (inverted) {
            writef(g, "~Mreturn !~V;~N", p);
        } else {
            writef(g, "~Mreturn ~V;~N", p);
        }
        p->right = NULL;
        g->unreachable = true;
        return;
    }
    if (inverted) {
        write_failure_if(g, "~V", p);
    } else {
        write_failure_if(g, "!~V", p);
    }
}

/* Generate "false": always the failure action. */
static void generate_false(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_failure(g);
}

/* Generate "?": a debug() call numbered by g->debug_count and carrying the
 * source line number. */
static void generate_debug(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->I[0] = g->debug_count++;
    g->I[1] = p->line_number;
    writef(g, "~Mdebug(~I0, ~I1);~N", p);
}

/* Generate code for node p by dispatching on its type.  Nothing is
 * generated once g->unreachable is set.  The failure label and failure
 * string are saved here and reinstated after the node has been generated,
 * so individual generators may change them freely. */
static void generate(struct generator * g, struct node * p) {
    if (g->unreachable) return;

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    switch (p->type) {
        case c_define:        generate_define(g, p); break;
        case c_bra:           generate_bra(g, p); break;
        case c_and:           generate_and(g, p); break;
        case c_or:            generate_or(g, p); break;
        case c_backwards:     generate_backwards(g, p); break;
        case c_not:           generate_not(g, p); break;
        case c_set:           generate_set(g, p); break;
        case c_unset:         generate_unset(g, p); break;
        case c_try:           generate_try(g, p); break;
        case c_fail:          generate_fail(g, p); break;
        case c_reverse:
        case c_test:          generate_test(g, p); break;
        case c_do:            generate_do(g, p); break;
        case c_goto:          generate_GO(g, p, 1); break;
        case c_gopast:        generate_GO(g, p, 0); break;
        case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break;
        case c_gopast_grouping:
                              generate_GO_grouping(g, p, 0, 0); break;
        case c_goto_non:      generate_GO_grouping(g, p, 1, 1); break;
        case c_gopast_non:    generate_GO_grouping(g, p, 0, 1); break;
        case c_repeat:        generate_repeat(g, p); break;
        case c_loop:          generate_loop(g, p); break;
        case c_atleast:       generate_atleast(g, p); break;
        case c_tomark:        generate_tomark(g, p); break;
        case c_hop:           generate_hop(g, p); break;
        case c_delete:        generate_delete(g, p); break;
        case c_next:          generate_next(g, p); break;
        case c_tolimit:       generate_tolimit(g, p); break;
        case c_leftslice:     generate_leftslice(g, p); break;
        case c_rightslice:    generate_rightslice(g, p); break;
case c_assignto:      generate_assignto(g, p); break;
        case c_sliceto:       generate_sliceto(g, p); break;
        case c_stringassign:  generate_stringassign(g, p); break;
        case c_insert:
        case c_attach:        generate_insert(g, p, p->type); break;
        case c_slicefrom:     generate_slicefrom(g, p); break;
        case c_setlimit:      generate_setlimit(g, p); break;
        case c_dollar:        generate_dollar(g, p); break;
        case c_assign:        generate_integer_assign(g, p, "="); break;
        case c_plusassign:    generate_integer_assign(g, p, "+="); break;
        case c_minusassign:   generate_integer_assign(g, p, "-="); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
        case c_divideassign:  generate_integer_assign(g, p, "~/="); break;
        case c_eq:
        case c_ne:
        case c_gt:
        case c_ge:
        case c_lt:
        case c_le:
            generate_integer_test(g, p);
            break;
        case c_call:          generate_call(g, p); break;
        case c_grouping:      generate_grouping(g, p, false); break;
        case c_non:           generate_grouping(g, p, true); break;
        case c_name:          generate_namedstring(g, p); break;
        case c_literalstring: generate_literalstring(g, p); break;
        case c_among:         generate_among(g, p); break;
        case c_substring:     generate_substring(g, p); break;
        case c_booltest:      generate_booltest(g, p, false); break;
        case c_not_booltest:  generate_booltest(g, p, true); break;
        case c_false:         generate_false(g, p); break;
        case c_true:          break;
        case c_debug:         generate_debug(g, p); break;
        case c_functionend:   generate_functionend(g, p); break;
        default: fprintf(stderr, "%d encountered\n", p->type);
                 exit(1);
    }

    /* Reinstate the failure target that was current on entry. */
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
}

/* Write the file preamble: lint suppressions, the runtime import, a doc
 * comment and the opening of the class declaration. */
static void generate_class_begin(struct generator * g) {
    w(g, "// ignore_for_file: non_constant_identifier_names, slash_for_doc_comments, camel_case_types~N~N");
    w(g, "import '../src/snowball.dart';~N~N");

    w(g, "/**~N"
         " * This class implements the stemming algorithm defined by a snowball script.~N"
         " *~N"
         " * ");
    write_generated_comment_content(g);
    w(g, "~N */~N"
         "class ~n extends ");
    write_string(g, g->options->parent_class_name);
    w(g, " {~+~N");
}

/* Close the class declaration. */
static void generate_class_end(struct generator * g) {
    w(g, "~-~M~N");
    w(g, "}~N");
}

/* Write operator == (true for any object of the generated class) and a
 * matching hashCode. */
static void generate_equals(struct generator * g) {
    w(g, "~N"
         "~M@override~N"
         "~Mbool operator ==(Object other) => other is ");
    write_s(g, g->options->name);
    w(g, ";~N~N"
         "~M@override~N"
         "~Mint get hashCode => runtimeType.toString().hashCode;~N");
}

/* Write the static table a_<number> of Among entries for x and, when x has
 * more than one among function, the af_<number>() dispatcher which switches
 * on "af". */
static void generate_among_table(struct generator * g, struct among * x) {
    write_comment(g, x->node);

    struct amongvec * v = x->b;

    g->I[0] = x->number;
    w(g, "~Mstatic final a_~I0 = [~N~+");

    for (int i = 0; i < x->literalstring_count; i++) {
        if (i) w(g, ",~N");
        g->I[3] = v[i].i;
        g->I[4] = v[i].result;
        g->I[5] = v[i].function_index;

        w(g, "~MAmong(");
        write_literal_string(g, v[i].b);
        w(g, ", ~I3, ~I4, ~I5)");
    }
    w(g, "~N~-~M];~N~N");

    /* With at most one function no dispatcher is needed. */
    if (x->function_count <= 1) return;

    w(g, "~N~Mbool af_~I0() {~N~+");
    w(g, "~Mswitch (af) {~N~+");
    for (int n = 1; n <= x->function_count; n++) {
        w(g, "~Mcase ");
        write_int(g, n);
        w(g, ": return ");
        /* Find the first entry using function index n. */
        for (int i = 0; i < x->literalstring_count; i++) {
            if (v[i].function_index == n) {
                write_varref(g, v[i].function);
                w(g, "();~N");
                break;
            }
        }
    }
    w(g, "~-~M}~N");
    w(g, "~Mreturn false;~N");
    w(g, "~-~M}~N");
}

/* Write the tables for every among in the program. */
static void generate_amongs(struct generator * g) {
    for (struct among * x = g->analyser->amongs; x; x = x->next) {
        generate_among_table(g, x);
    }
}

/* Set bit i of the bitmap b, which is packed 8 bits per element. */
static void set_bit(symbol * b, int i) { b[i >> 3] |= 1 << (i & 7); }

/* Write grouping q as a static bitmap covering smallest_ch..largest_ch,
 * with one bit per character, wrapped in String.fromCharCodes([...]). */
static void generate_grouping_table(struct generator * g, struct grouping * q) {
    int range = q->largest_ch - q->smallest_ch + 1;
    int size = (range + 7) / 8;  /* assume 8 bits per symbol */
    symbol * b = q->b;
    symbol * map = create_b(size);

    for (int i = 0; i < size; i++) map[i] = 0;

    for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);

    w(g, "~Mstatic final ");
    write_varname(g, q->name);
    write_string(g, " = String.fromCharCodes([");
    for (int i = 0; i < size; i++) {
        if (i) w(g, ", ");
        write_int(g, map[i]);
    }
    w(g, "]);~N~N");

    lose_b(map);
}

static void generate_groupings(struct
generator * g) {
    for (struct grouping * q = g->analyser->groupings; q; q = q->next) {
        generate_grouping_table(g, q);
    }
}

/* Declare a class member, with a default initial value, for every string,
 * integer and boolean name that is not local to a routine. */
static void generate_members(struct generator * g) {
    bool wrote_members = false;

    for (struct name * q = g->analyser->names; q; q = q->next) {
        /* Localised variables are declared inside their routine instead. */
        if (q->local_to) continue;
        switch (q->type) {
            case t_string:
                w(g, "~MString ");
                write_varname(g, q);
                w(g, " = '';~N");
                wrote_members = true;
                break;
            case t_integer:
                w(g, "~Mint ");
                write_varname(g, q);
                w(g, " = 0;~N");
                wrote_members = true;
                break;
            case t_boolean:
                w(g, "~Mbool ");
                write_varname(g, q);
                w(g, " = false;~N");
                wrote_members = true;
                break;
        }
    }
    if (wrote_members) w(g, "~N");
}

/* Generate every top-level node of the program, clearing the unreachable
 * flag after each one. */
static void generate_methods(struct generator * g) {
    for (struct node * p = g->analyser->program; p; p = p->right) {
        generate(g, p);
        g->unreachable = false;
    }
}

/* Entry point: build the whole Dart source in g->outbuf and write it to
 * g->options->output_src. */
extern void generate_program_dart(struct generator * g) {
    g->outbuf = str_new();
    g->failure_str = str_new();

    write_start_comment(g, "// ", NULL);
    generate_class_begin(g);

    generate_amongs(g);
    generate_groupings(g);

    generate_members(g);
    generate_methods(g);
    generate_equals(g);

    generate_class_end(g);

    output_str(g->options->output_src, g->outbuf);
    str_delete(g->failure_str);
    str_delete(g->outbuf);
}
snowball-3.1.0/compiler/generator_go.c000066400000000000000000001216451520373054300200000ustar00rootroot00000000000000#include #include /* for toupper */ #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h"

/* prototype functions for recursive use: */

static void generate(struct generator * g, struct node * p);
static void w(struct generator * g, const char * s);
static void writef(struct generator * g, const char * s, struct node * p);

/* Write routines for items from the syntax tree */

static void write_relop(struct generator * g, int relop) {
    // Relational operators are the same as C.
write_c_relop(g, relop); } static void write_varname(struct generator * g, struct name * p) { // We use the same naming scheme for both global and local variables. write_char(g, "SbirrG"[p->type]); write_char(g, '_'); write_s(g, p->s); } /* Reference to variable, e.g. when assigning to or using in an expression. */ static void write_varref(struct generator * g, struct name * p) { if (p->type >= t_routine || p->local_to == NULL) { write_string(g, "context."); } write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { int i = 0; write_char(g, '"'); while (i < SIZE(p)) { int ch; i += get_utf8(p + i, &ch); // Write out ASCII and lower Unicode printables as literal characters. // Use escapes for anything over 0x590 as a crude way to avoid LTR // characters affecting the rendering of source character order in // confusing ways. if ((32 <= ch && ch < 127) || (0xa0 < ch && ch < 0x590)) { if (ch == '"' || ch == '\\') write_char(g, '\\'); write_wchar_as_utf8(g, ch); } else { write_string(g, "\\u"); write_hex4(g, ch); } } write_char(g, '"'); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; /* FIXME could use Go //line syntax if we had original filename */ write_margin(g); write_string(g, "// "); write_comment_content(g, p, NULL); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~+{~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "env.Limit - "; writef(g, "~Mvar ~B0 = ~S1env.Cursor~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "env.Cursor = "); if (p->mode != m_forward) str_append_string(out, "env.Limit - "); str_append(out, savevar); } static void write_restorecursor(struct 
generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? "env.NextChar()" : "env.PrevChar()"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; w(g, "~-~Mlab~I0:~N~+~Mfor {~N~+"); } static void wsetlab_end(struct generator * g, int n) { if (!g->unreachable) { g->I[0] = n; w(g, "~Mbreak lab~I0~N"); } w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { g->I[0] = n; w(g, "~Mbreak lab~I0~N"); g->unreachable = true; } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn false"); break; default: w(g, "~Mbreak lab"); write_int(g, g->failure_label); } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "env.Cursor >= env.Limit", p); } else { write_failure_if(g, "env.Cursor <= env.LimitBackward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'E': { // Write an external name. 
if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } char save_initial = p->name->s[0]; p->name->s[0] = toupper(save_initial); write_s(g, p->name->s); p->name->s[0] = save_initial; continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "snowballRuntime.MaxInt"); break; case c_minint: write_string(g, "snowballRuntime.MinInt"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_divide: s = " / "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "env.Cursor"); break; case c_limit: w(g, p->mode == m_forward ? 
"env.Limit" : "env.LimitBackward"); break; case c_len: w(g, "snowballRuntime.RuneCountInString(env.Current())"); break; case c_size: w(g, "len(env.Current())"); break; case c_lenof: writef(g, "snowballRuntime.RuneCountInString(~V)", p); break; case c_sizeof: writef(g, "len(~V)", p); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); bool end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar && K_needed_node_on_f(p)) { write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Menv.LimitBackward = env.Cursor~N" "~Menv.Cursor = env.Limit~N", p); generate(g, p->left); w(g, "~Menv.Cursor = env.LimitBackward~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void 
generate_set(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = true~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = false~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); writef(g, "~M~W(env, context)~N", p->left); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "B"; g->S[1] = complement ? "In" : "Out"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.Go~S1Grouping~S0(~W, ~I0, ~I1)", p); if (!is_goto) { w(g, p->mode == m_forward ? 
"~Menv.NextChar()~N" : "~Menv.PrevChar()~N"); } } static void generate_GO(struct generator * g, struct node * p, int is_goto) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); bool end_unreachable = false; int golab = new_label(g); g->I[0] = golab; w(g, "~-~Mgolab~I0:~N~+~Mfor {~N~+"); struct str * savevar = NULL; if (is_goto || repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (is_goto) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0~N"); g->unreachable = true; } wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); w(g, "~Mfor _ = range make([]struct{}, "); generate_AE(g, p->AE); writef(g, ") {~N~+", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { int replab = new_label(g); g->I[0] = replab; writef(g, "~-~Mreplab~I0:~N~+~Mfor {~N~+", p); struct str * savevar = NULL; if (repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); g->I[0] = g->failure_label; w(g, "~-~Mlab~I0:~N~+~Mfor range [2]struct{}{} 
{~N~+"); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0--~N"); } g->I[0] = replab; w(g, "~Mcontinue replab~I0~N"); } w(g, "~-~M}~N"); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->I[0] = replab; w(g, "~Mbreak replab~I0~N~-~M}~N"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mvar ~B0 = "); generate_AE(g, p->AE); w(g, "~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif env.Cursor ~S0 "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Menv.Cursor = "); generate_AE(g, p->AE); writef(g, "~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); // Generate the AE to a temporary block so we can substitute it in // write_failure_if(). struct str * ae = str_new(); struct str * s = g->outbuf; g->outbuf = ae; generate_AE(g, p->AE); g->outbuf = s; g->B[0] = str_data(ae); g->S[0] = p->mode == m_forward ? "" : "Back"; g->S[1] = p->AE->type == c_number ? 
"" : "Checked"; write_failure_if(g, "!env.Hop~S0~S1(~B0)", p); str_delete(ae); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Menv.SliceDel()~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Limit" : "env.LimitBackward"; writef(g, "~Menv.Cursor = ~S0~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Bra" : "env.Ket"; writef(g, "~M~S0 = env.Cursor~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Ket" : "env.Bra"; writef(g, "~M~S0 = env.Cursor~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = env.AssignTo()~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = env.SliceTo()~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { write_margin(g); write_block_start(g); write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~Mvar c = env.Cursor~N"); w(g, "~Mbra, ket := env.Cursor, env.Cursor~N"); writef(g, "~Menv.Insert(bra, ket, ", p); generate_address(g, p); writef(g, ")~N", p); if (keep_c) w(g, "~Menv.Cursor = c~N"); write_block_end(g); } static void generate_stringassign(struct generator * g, struct node * p) { write_margin(g); write_block_start(g); write_comment(g, p); int keep_c = p->mode == m_forward; /* like 'attach' */ if (keep_c) { writef(g, "~Mvar c = env.Cursor~N", p); } if (p->mode == m_forward) { writef(g, 
/* Generate code for "<-": an env.SliceFrom() call whose argument is the
 * literal string or string variable attached to p.
 */
static void generate_slicefrom(struct generator * g, struct node * p) {
    write_comment(g, p);

    write_margin(g);
    write_string(g, "env.SliceFrom(");
    generate_address(g, p);
    write_char(g, ')');
    write_newline(g);
}
str_append(g->failure_str, varname); } else { str_assign(g->failure_str, "env.LimitBackward = "); str_append(g->failure_str, varname); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~Mvar ~B0 = env.Clone()~N" "~Menv.SetCurrent(~V)~N", p); if (p->left->possible_signals == -1) { /* Assume failure. */ w(g, "~Mvar ~B0_f = true~N"); } wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable && p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_f = false~N"); } wsetlab_end(g, g->failure_label); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); /* Update string variable; restore env. */ writef(g, "~M~V = env.Current()~N" "~M*env = *~B0~N", p); if (p->left->possible_signals == 0) { // p->left always signals f. write_failure(g); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_f", p); } str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, "~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif "); // We want the inverse of the snowball test here. 
relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, "~N"); } else { write_char(g, ' '); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~W(env, context)~N", p); p->right = NULL; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~Mreturn ~W(env, context)~N", p); return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~W(env, context)~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~W(env, context)~N", p); write_failure(g); } else { write_failure_if(g, "!~W(env, context)", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "B"; g->S[1] = complement ? "Out" : "In"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn env.~S1Grouping~S0(~W, ~I0, ~I1);~N", p); p->right = NULL; } else { write_failure_if(g, "!env.~S1Grouping~S0(~W, ~I0, ~I1)", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "B"; if (tailcallable(g, p)) { writef(g, "~Mreturn env.EqS~S0(~V)~N", p); p->right = NULL; } else { write_failure_if(g, "!env.EqS~S0(~V)", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "B"; if (tailcallable(g, p)) { writef(g, "~Mreturn env.EqS~S0(~L)~N", p); p->right = NULL; } else { write_failure_if(g, "!env.EqS~S0(~L)", p); } } static void generate_setup_context(struct generator * g) { if (g->analyser->variable_count == 0) { w(g, "~Mvar context = &Context{}~N"); w(g, "~M_ = context~N"); return; } w(g, "~Mvar context = &Context{~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: write_margin(g); write_varname(g, q); w(g, ": \"\",~N"); break; case t_integer: write_margin(g); write_varname(g, q); w(g, ": 0,~N"); break; case t_boolean: write_margin(g); write_varname(g, q); w(g, ": false,~N"); break; } } w(g, "~-~M}~N"); w(g, "~M_ = context~N"); } static void generate_define(struct generator * g, struct node * p) { write_newline(g); write_comment(g, p); struct name * q = p->name; if (q->type == t_routine) { writef(g, "~Mfunc ~W(env *snowballRuntime.Env, ctx interface{}) bool {~+~N", p); w(g, "~Mcontext := ctx.(*Context)~N"); w(g, "~M_ = context~N"); } else { writef(g, "~Mfunc ~E(env *snowballRuntime.Env) bool {~+~N", p); generate_setup_context(g); if (q->used != q->definition) { // This external needs to be callable as a routine, so generate // the actual code like a routine with an external which just // forwards to that. writef(g, "~Mreturn ~W(env, context)~N", p); w(g, "~-~M}~N"); writef(g, "~Mfunc ~W(env *snowballRuntime.Env, ctx interface{}) bool {~+~N", p); w(g, "~Mcontext := ctx.(*Context)~N"); w(g, "~M_ = context~N"); } } if (q->amongvar_needed) { w(g, "~Mvar among_var int32~N"); } /* Declare localised variables. 
/* Generate the env.FindAmong*() lookup for the among attached to p.
 *
 * The shape of the emitted code depends on whether the result is needed
 * in among_var and on whether the among can fail to match.
 */
static void generate_substring(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct among * x = p->among;

    /* Backward mode uses the "B" variant of FindAmong. */
    g->S[0] = p->mode == m_forward ? "" : "B";
    g->I[0] = x->number;

    if (x->amongvar_needed) {
        /* The result is needed later, so store it in among_var. */
        writef(g, "~Mamong_var = env.FindAmong~S0(A_~I0, context)~N", p);
        if (!x->always_matches) {
            write_failure_if(g, "among_var == 0", p);
        }
    } else if (x->always_matches) {
        /* Cannot fail and the result is unused: just make the call. */
        writef(g, "~Menv.FindAmong~S0(A_~I0, context)~N", p);
    } else if (x->command_count == 0 && tailcallable(g, p)) {
        /* No commands and in tail position: return the match result
         * directly. */
        writef(g, "~Mreturn env.FindAmong~S0(A_~I0, context) != 0~N", p);
        x->node->right = NULL;
        g->unreachable = true;
    } else {
        write_failure_if(g, "env.FindAmong~S0(A_~I0, context) == 0", p);
    }
}
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch among_var {~N"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); w(g, "~-"); g->unreachable = false; } w(g, "~M}~N"); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. if (inverted) { writef(g, "~Mreturn !~V~N", p); } else { writef(g, "~Mreturn ~V~N", p); } p->right = NULL; return; } if (inverted) { write_failure_if(g, "~V", p); } else { write_failure_if(g, "!~V", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Menv.Debug(~I0, ~I1)~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); 
break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void 
generate_class_begin(struct generator * g) { w(g, "package "); write_string(g, g->options->package); w(g, "~N~N"); w(g, "import (~N"); w(g, "~+~MsnowballRuntime \""); write_string(g, g->options->go_snowball_runtime); w(g, "\"~N~-)~N~N"); } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Mvar A_~I0 = []*snowballRuntime.Among{~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; w(g, "~M&snowballRuntime.Among{Str: "); write_literal_string(g, v[i].b); w(g, ", A: ~I0, B: ~I1, F: "); if (v[i].function != NULL) { write_varname(g, v[i].function); } else { w(g, "nil"); } w(g, "},~N"); } w(g, "~-~M}~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i >> 3] |= 1 << (i & 7); } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7) / 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); w(g, "~Mvar "); write_varname(g, q->name); w(g, " = []byte{"); for (int i = 0; i < size; i++) { if (i) w(g, ", "); write_int(g, map[i]); } w(g, "}~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { w(g, "type Context struct {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: write_margin(g); write_varname(g, q); w(g, " string~N"); break; case t_integer: write_margin(g); write_varname(g, q); w(g, " 
/* Entry point of the Go generator: build the complete Go source in
 * g->outbuf and write it to g->options->output_src.
 */
extern void generate_program_go(struct generator * g) {
    /* Use a tab as the margin indent string for Go output. */
    g->margin_indent = "\t";

    g->outbuf = str_new();
    g->failure_str = str_new();

    write_start_comment(g, "//! ", NULL);
    /* Package clause and runtime import first, then the among tables,
     * grouping tables and Context struct, then the routines. */
    generate_class_begin(g);

    generate_amongs(g);
    generate_groupings(g);

    generate_members(g);
    generate_methods(g);

    output_str(g->options->output_src, g->outbuf);
    /* Release the buffers allocated above. */
    str_delete(g->failure_str);
    str_delete(g->outbuf);
}
/* Write the symbols of p as a double-quoted Java string literal. */
static void write_literal_string(struct generator * g, symbol * p) {
    const int len = SIZE(p);

    write_char(g, '"');
    for (int k = 0; k < len; k++) {
        const int ch = p[k];

        if (ch >= 128) {
            /* Everything outside ASCII becomes a \u escape. */
            write_string(g, "\\u");
            write_hex4(g, ch);
        } else if (ch < 32 || ch >= 127) {
            // Escape as octal.
            write_char(g, '\\');
            write_char(g, '0' + ((ch >> 6) & 0x03));
            write_char(g, '0' + ((ch >> 3) & 0x07));
            write_char(g, '0' + (ch & 0x07));
        } else {
            /* Printable ASCII: literal, with quote and backslash escaped. */
            if (ch == '\"' || ch == '\\') write_char(g, '\\');
            write_char(g, ch);
        }
    }
    write_char(g, '"');
}
"cursor++;" : "cursor--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { w(g, "~Mlab"); write_int(g, n); w(g, ": {~+~N"); } static void wsetlab_end(struct generator * g) { w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_char(g, ';'); write_newline(g); g->unreachable = true; } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_char(g, ';'); } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "cursor >= limit", p); } else { write_failure_if(g, "cursor <= limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "Integer.MAX_VALUE"); break; case c_minint: write_string(g, "Integer.MIN_VALUE"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case 
c_multiply: s = " * "; goto label0; case c_divide: s = " / "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; case c_len: /* Same as size() for Java. */ case c_size: w(g, "length"); break; case c_lenof: /* Same as sizeof() for Java. */ case c_sizeof: writef(g, "L~V", p); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); bool end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar && K_needed_node_on_f(p)) { write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mlimit_backward = cursor;~N" "~Mcursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; if (savevar) { str_delete(savevar); } } 
static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); writef(g, "~M~V();~N", p->left); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"in" : "out"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!go_~S1_grouping~S0(~V, ~I0, ~I1)", p); if (!is_goto) { if (p->mode == m_forward) w(g, "~Mcursor++;~N"); else w(g, "~Mcursor--;~N"); } } static void generate_GO(struct generator * g, struct node * p, int is_goto) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); bool end_unreachable = false; int golab = new_label(g); g->I[0] = golab; w(g, "~Mgolab~I0: while (true)~N"); w(g, "~{"); struct str * savevar = NULL; if (is_goto || repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (is_goto) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0;~N"); } g->unreachable = false; wsetlab_end(g); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mfor (int ~B0 = "); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { writef(g, "~Mwhile (true)~N~{", p); struct str * savevar = NULL; if (repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } 
int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); } wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~{"); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~Mint c = cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> limit" : "< limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. 
write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 || c ~S2 cursor", p); } writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif (~V.length < limit) {~N~+", p); writef(g, "~M~V = Arrays.copyOf(current, limit);~N~-", p); writef(g, "~M} else {~N~+", p); writef(g, "~MSystem.arraycopy(current, 0, ~V, 0, limit);~N~-", p); writef(g, "~M}~N", p); writef(g, "~ML~V = limit;~N", p); g->java_import_arrays = true; } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_check();~N", p); writef(g, "~Mif (~V.length < ket - bra) {~N~+", p); writef(g, "~M~V = Arrays.copyOfRange(current, bra, ket);~N~-", p); writef(g, "~M} else {~N~+", p); writef(g, "~MSystem.arraycopy(current, bra, ~V, 0, ket - bra);~N~-", p); writef(g, "~M}~N", p); writef(g, "~ML~V = ket - bra;~N", p); g->java_import_arrays = true; } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_string(g, "new CharArraySequence("); write_varref(g, p->name); write_string(g, ", L"); write_varref(g, p->name); write_string(g, ")"); g->java_import_chararraysequence = true; } } static 
void generate_insert(struct generator * g, struct node * p, int style) { write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_stringassign(struct generator * g, struct node * p) { write_comment(g, p); int keep_c = p->mode == m_forward; /* like 'attach' */ if (keep_c) { writef(g, "~{~Mint c = cursor;~N", p); } if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { writef(g, "~Minsert(limit_backward, cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~Mcursor = c;~N~}"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mslice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit;~N"); w(g, "~Mlimit = "); generate_AE(g, q->AE); w(g, ";~N"); w(g, "~M~B0 -= limit;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = "); generate_AE(g, q->AE); w(g, ";~N"); } if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = cursor;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~N" "~MSnowballProgram ~B0 
= new SnowballProgram(this);~N", p); writef(g, "~Mcurrent = ~V;~N" "~Mcursor = 0;~N" "~Mlength = L~V;~N" "~Mlimit = length;~N", p); if (p->left->possible_signals == -1) { /* Assume failure. */ w(g, "~Mboolean ~B0_f = true;~N"); } wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable && p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_f = false;~N"); } wsetlab_end(g); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~ML~W = length;~N" "~Mcopy_from(~B0);~N", p); ++g->copy_from_count; if (p->left->possible_signals == 0) { // p->left always signals f. write_failure(g); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_f", p); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); } else { w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~V();~N", p); p->right = NULL; g->unreachable = true; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~Mreturn ~V();~N", p); g->unreachable = true; return; } if (signals == 1) { /* Always succeeds. 
*/ writef(g, "~M~V();~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V();~N", p); write_failure(g); } else { write_failure_if(g, "!~V()", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn ~S1_grouping~S0(~V, ~I0, ~I1);~N", p); p->right = NULL; } else { write_failure_if(g, "!(~S1_grouping~S0(~V, ~I0, ~I1))", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_s~S0(new CharArraySequence(~V, L~V));~N", p); p->right = NULL; } else { write_failure_if(g, "!(eq_s~S0(new CharArraySequence(~V, L~V)))", p); } g->java_import_chararraysequence = true; } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn eq_s~S0(~L);~N", p); p->right = NULL; } else { write_failure_if(g, "!(eq_s~S0(~L))", p); } } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; write_newline(g); write_comment(g, p); if (q->type == t_routine) { g->S[0] = "private"; } else { if (SIZE(q->s) == 4 && memcmp(q->s, "stem", 4) == 0) { w(g, "~M@Override~N"); } g->S[0] = "public"; } writef(g, "~M~S0 boolean ~V() {~+~N", p); g->next_label = 0; g->var_number = 0; if (q->amongvar_needed) { w(g, "~Mint among_var;~N"); } /* Declare localised variables. 
*/ for (struct name * name = g->analyser->names; name; name = name->next) { if (name->local_to == q) { switch (name->type) { case t_string: assert(0); break; case t_integer: w(g, "~Mint "); write_varname(g, name); w(g, ";~N"); break; case t_boolean: w(g, "~Mboolean "); write_varname(g, name); w(g, ";~N"); break; } } } str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; /* Generate function body. */ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (p->left->possible_signals) { generate(g, p->left->right); } } w(g, "~}"); } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = find_among~S0(a_~I0);~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Mfind_among~S0(a_~I0);~N", p); } else if (x->command_count == 0 && tailcallable(g, p)) { writef(g, "~Mreturn find_among~S0(a_~I0) != 0;~N", p); x->node->right = NULL; g->unreachable = true; } else { write_failure_if(g, "find_among~S0(a_~I0) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch (among_var) {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); if (!g->unreachable) w(g, "~Mbreak;~N"); w(g, "~-"); g->unreachable = false; } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. if (inverted) { writef(g, "~Mreturn !~V;~N", p); } else { writef(g, "~Mreturn ~V;~N", p); } p->right = NULL; g->unreachable = true; return; } if (inverted) { write_failure_if(g, "~V", p); } else { write_failure_if(g, "!~V", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: 
generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; 
str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "package "); write_string(g, g->options->package); w(g, ";~N~N"); if (g->java_import_arrays) { w(g, "import java.util.Arrays;~N~N"); } if (g->analyser->amongs) { w(g, "import "); write_string(g, g->options->among_class); w(g, ";~N~N"); } if (g->java_import_chararraysequence) { w(g, "import org.tartarus.snowball.CharArraySequence;~N~N"); } if (g->copy_from_count > 0) { w(g, "import org.tartarus.snowball.SnowballProgram;~N~N"); } w(g, "/**~N" " * This class implements the stemming algorithm defined by a snowball script.~N" " *

~N" " * "); write_generated_comment_content(g); w(g, "~N" " *

~N" " */~N" "@SuppressWarnings(\"unused\")~N" "public class ~n extends "); write_string(g, g->options->parent_class_name); w(g, " {~+~N"); if (g->analyser->among_with_function_count > 0) { w(g, "~Mprivate static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup();~N"); } write_newline(g); } static void generate_class_end(struct generator * g) { w(g, "~N}"); w(g, "~N"); } static void generate_equals(struct generator * g) { w(g, "~N" "~M@Override~N" "~Mpublic boolean equals( Object o ) {~N" "~+~Mreturn o instanceof "); write_s(g, g->options->name); w(g, ";~N~-~M}~N" "~N" "~M@Override~N" "~Mpublic int hashCode() {~N" "~+~Mreturn "); write_s(g, g->options->name); w(g, ".class.getName().hashCode();~N" "~-~M}~N"); } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Mprivate final static Among[] a_~I0 = {~N~+"); for (int i = 0; i < x->literalstring_count; i++) { if (i) w(g, ",~N"); g->I[0] = v[i].i; g->I[1] = v[i].result; w(g, "~Mnew Among("); write_literal_string(g, v[i].b); w(g, ", ~I0, ~I1"); if (v[i].function != NULL) { w(g, ", \""); write_varname(g, v[i].function); w(g, "\", methodObject"); } w(g, ")"); } w(g, "~N~-~M};~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i >> 3] |= 1 << (i & 7); } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7) / 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); w(g, "~Mprivate static final char[] "); write_varname(g, q->name); write_string(g, " = {"); for (int i = 0; i < size; i++) { if 
(i) w(g, ", "); write_int(g, map[i]); } w(g, " };~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { bool wrote_members = false; for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: w(g, "~Mprivate char[] "); write_varname(g, q); w(g, " = new char[8];~N"); w(g, "~Mprivate int L"); write_varname(g, q); w(g, " = 0;~N"); wrote_members = true; break; case t_integer: w(g, "~Mprivate int "); write_varname(g, q); w(g, ";~N"); wrote_members = true; break; case t_boolean: w(g, "~Mprivate boolean "); write_varname(g, q); w(g, ";~N"); wrote_members = true; break; } } if (wrote_members) w(g, "~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void generate_program_java(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); w(g, "~+"); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); generate_equals(g); generate_class_end(g); w(g, "~-"); { /* We need to call generate_class_begin() after we've generated the * methods so we know if copy_from_count > 0. 
*/ struct str * body = g->outbuf; g->outbuf = str_new(); write_start_comment(g, "// ", NULL); generate_class_begin(g); str_append(g->outbuf, body); str_delete(body); } output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.1.0/compiler/generator_js.c000066400000000000000000001332531520373054300200050ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h" /* prototype functions for recursive use: */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); /* Write routines for items from the syntax tree */ static void write_relop(struct generator * g, int relop) { // Relational operators are the same as C, save for (in-)equality. switch (relop) { case c_eq: write_string(g, " === "); break; case c_ne: write_string(g, " !== "); break; default: write_c_relop(g, relop); } } static void write_varname(struct generator * g, struct name * p) { if (p->type == t_external) { if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } } else { // We use the same naming scheme for both global and local variables. write_char(g, "SBIrxg"[p->type]); write_char(g, '_'); } write_s(g, p->s); } /* Reference to variable, e.g. when assigning to or using in an expression. 
*/ static void write_varref(struct generator * g, struct name * p) { if (p->type != t_grouping && p->local_to == NULL) { w(g, "this.#"); } write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_char(g, '"'); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_char(g, '\\'); write_char(g, ch); } else { write_string(g, "\\u"); write_hex4(g, ch); } } write_char(g, '"'); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p, NULL); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "this.limit - "; writef(g, "~Mconst /** number */ ~B0 = ~S1this.c;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "this.c = "); if (p->mode != m_forward) str_append_string(out, "this.limit - "); str_append(out, savevar); str_append_ch(out, ';'); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? "this.c++;" : "this.c--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; // This label may end up unused, but we can't easily tell at this point. 
w(g, "~M// deno-lint-ignore no-unused-labels~N"); w(g, "~Mlab~I0: {~N~+"); } static void wsetlab_end(struct generator * g) { w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_char(g, ';'); write_newline(g); g->unreachable = true; } static void write_failure_(struct generator * g, byte after_if) { if (!after_if) write_margin(g); if (str_len(g->failure_str) != 0) { if (after_if) w(g, "{~N~+~M"); write_str(g, g->failure_str); write_newline(g); write_margin(g); } switch (g->failure_label) { case x_return: write_string(g, "return false;"); break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_char(g, ';'); } write_newline(g); g->unreachable = true; if (after_if && str_len(g->failure_str) != 0) { w(g, "~-~M}~N"); } } static void write_failure(struct generator * g) { write_failure_(g, false); } static void write_failure_after_if(struct generator * g) { write_failure_(g, true); } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ") ", p); write_failure_after_if(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "this.c >= this.limit", p); } else { write_failure_if(g, "this.c <= this.limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_failure(g); g->unreachable = false; continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'F': { // Among function dispatcher. struct among * x = p->among; if (x->function_count == 0) { continue; } w(g, ", "); if (x->function_count == 1) { // Only one different function used in this among. 
struct amongvec * v = x->b; for (int j = 0; j < x->literalstring_count; j++) { if (v[j].function) { write_varref(g, v[j].function); goto continue_outer_loop; } } fprintf(stderr, "function_count == 1 but no among functions\n"); exit(1); continue_outer_loop: continue; } w(g, "this.#af_"); write_int(g, x->number); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'P': write_string(g, g->options->parent_class_name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "(-1>>>1)"); break; case c_minint: write_string(g, "(~(-1>>>1))"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_divide: /* Snowball specifies integer division with semantics matching C, * so we need to use `Math.trunc(x/y)` here. */ write_string(g, "Math.trunc"); s = " / "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "this.c"); break; case c_limit: w(g, p->mode == m_forward ? 
"this.limit" : "this.limit_backward"); break; case c_len: /* Same as size() for Javascript. */ case c_size: w(g, "this.current.length"); break; case c_lenof: /* Same as sizeof() for Javascript. */ case c_sizeof: writef(g, "~V.length", p); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); bool end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar && K_needed_node_on_f(p)) { write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mthis.limit_backward = this.c; this.c = this.limit;~N", p); generate(g, p->left); w(g, "~Mthis.c = this.limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; if (savevar) { 
str_delete(savevar); } } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); writef(g, "~M~V();~N", p->left); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"in" : "out"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!this.go_~S1_grouping~S0(~V, ~I0, ~I1)", p); if (!is_goto) { if (p->mode == m_forward) w(g, "~Mthis.c++;~N"); else w(g, "~Mthis.c--;~N"); } } static void generate_GO(struct generator * g, struct node * p, int is_goto) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); bool end_unreachable = false; int golab = new_label(g); g->I[0] = golab; // This label may end up unused, but we can't easily tell at this point. w(g, "~M// deno-lint-ignore no-unused-labels~N"); w(g, "~Mgolab~I0: while (true)~N"); w(g, "~{"); struct str * savevar = NULL; if (is_goto || repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (is_goto) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0;~N"); } g->unreachable = false; wsetlab_end(g); if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mfor (let /** number */ ~B0 = "); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { writef(g, 
"~Mwhile (true) {~N~+", p); struct str * savevar = NULL; if (repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); } wsetlab_end(g); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } w(g, "~Mbreak;~N~}"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~{"); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mlet ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (this.c ~S0 "); generate_AE(g, p->AE); w(g, ") "); write_failure_after_if(g); g->unreachable = false; w(g, "~Mthis.c = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; g->S[1] = p->mode == m_forward ? "> this.limit" : "< this.limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. 
g->I[0] = p->AE->number; write_failure_if(g, "this.c ~S0 ~I0 ~S1", p); writef(g, "~Mthis.c ~S0= ~I0;~N", p); } else { w(g, "~{~Mconst /** number */ c = this.c ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); write_failure_if(g, "c ~S1 || c ~S2 this.c", p); writef(g, "~Mthis.c = c;~N", p); writef(g, "~}", p); } } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mthis.slice_del();~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mthis.c = this.limit;~N", p); } else { writef(g, "~Mthis.c = this.limit_backward;~N", p); } } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mthis.bra = this.c;~N", p); } else { writef(g, "~Mthis.ket = this.c;~N", p); } } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mthis.ket = this.c;~N", p); } else { writef(g, "~Mthis.bra = this.c;~N", p); } } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = this.current.slice(0, this.limit);~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = this.slice_to();~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) { w(g, "~{~Mconst /** number */ c = this.c;~N"); writef(g, "~Mthis.insert(c, c, ", p); } else { writef(g, "~Mthis.insert(this.c, this.c, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mthis.c = c;~N~}"); } static 
void generate_stringassign(struct generator * g, struct node * p) { write_comment(g, p); int keep_c = p->mode == m_forward; /* like 'attach' */ if (keep_c) { w(g, "~{~Mconst /** number */ c = this.c;~N"); if (p->mode == m_forward) { writef(g, "~Mthis.insert(c, this.limit, ", p); } else { writef(g, "~Mthis.insert(this.limit_backward, c, ", p); } } else { if (p->mode == m_forward) { writef(g, "~Mthis.insert(this.c, this.limit, ", p); } else { writef(g, "~Mthis.insert(this.limit_backward, this.c, ", p); } } generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~Mthis.c = c;~N~}"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mthis.slice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (this.c ~S0 "); generate_AE(g, q->AE); w(g, ") "); write_failure_after_if(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mlet /** number */ ~B0 = this.limit;~N"); w(g, "~Mthis.limit = "); generate_AE(g, q->AE); w(g, ";~N"); w(g, "~M~B0 -= this.limit;~N"); } else { w(g, "~Mconst /** number */ ~B0 = this.limit_backward;~N"); w(g, "~Mthis.limit_backward = "); generate_AE(g, q->AE); w(g, ";~N"); } if (p->mode == m_forward) { str_assign(g->failure_str, "this.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "this.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); w(g, "~Mconst /** number */ ~B0 = "); if (p->mode == m_forward) { w(g, "this.limit - this.c;~N"); w(g, "~Mthis.limit = this.c;~N"); } else { w(g, "this.limit_backward;~N"); w(g, "~Mthis.limit_backward = this.c;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "this.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "this.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); struct str * savevar = 
vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~N" "~Mconst /** !Object */ ~B0 = new ~P();~N", p); writef(g, "~M~B0.copy_from(this);~N", p); writef(g, "~Mthis.current = ~V;~N" "~Mthis.c = 0;~N" "~Mthis.limit_backward = 0;~N" "~Mthis.limit = this.current.length;~N", p); if (p->left->possible_signals == -1) { /* Assume failure. */ w(g, "~Mlet /**boolean*/ ~B0_f = true;~N"); } wsetlab_begin(g, g->failure_label); generate(g, p->left); if (p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_f = false;~N"); } wsetlab_end(g); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~M~V = this.current;~N" "~Mthis.copy_from(~B0);~N", p); ++g->copy_from_count; if (p->left->possible_signals == 0) { // p->left always signals f. write_failure(g); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_f", p); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); if (p->AE->type == c_number && p->AE->number == 1) { if (strcmp(s, "+=") == 0) { // Optimise `+= 1` to increment. writef(g, "~M++~V;~N", p); return; } else if (strcmp(s, "-=") == 0) { // Optimise `-= 1` to decrement. writef(g, "~M--~V;~N", p); return; } } g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. 
relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); } else { w(g, ") "); write_failure_after_if(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~V();~N", p); p->right = NULL; g->unreachable = true; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~Mreturn ~V();~N", p); g->unreachable = true; return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V();~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~V();~N", p); write_failure(g); } else { write_failure_if(g, "!~V()", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn this.~S1_grouping~S0(~V, ~I0, ~I1);~N", p); p->right = NULL; } else { write_failure_if(g, "!(this.~S1_grouping~S0(~V, ~I0, ~I1))", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn this.eq_s~S0(~V);~N", p); p->right = NULL; } else { write_failure_if(g, "!(this.eq_s~S0(~V))", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn this.eq_s~S0(~L);~N", p); p->right = NULL; } else { write_failure_if(g, "!(this.eq_s~S0(~L))", p); } } static void generate_define(struct generator * g, struct node * p) { write_newline(g); write_comment(g, p); writef(g, "~M/** @return {boolean} */~N" "~M#~W() {~+~N", p); /* Save output. */ struct str * saved_output = g->outbuf; struct str * saved_declarations = g->declarations; g->outbuf = str_new(); g->declarations = str_new(); g->next_label = 0; g->var_number = 0; struct name * q = p->name; if (q->amongvar_needed) { // The "among var" (`a`) is only assigned to, but the initialisation // can be generated in a nested block so it seems hard to declare it as // const and still have it visible when we want to use it. w(g, "~Mlet /** number */ a;~N"); } /* Declare localised variables. */ for (struct name * name = g->analyser->names; name; name = name->next) { if (name->local_to == q) { switch (name->type) { case t_string: w(g, "~Mlet /** string */ "); write_varname(g, name); w(g, ";~N"); break; case t_integer: w(g, "~Mlet /** number */ "); write_varname(g, name); w(g, ";~N"); break; case t_boolean: w(g, "~Mlet /** boolean */ "); write_varname(g, name); w(g, ";~N"); break; } } } str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; g->keep_count = 0; /* Generate function body. 
*/ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (p->left->possible_signals) { generate(g, p->left->right); } } w(g, "~-~M}~N"); if (q->type == t_external) { w(g, "~N"); w(g, "~M/**@return{string}*/~N"); writef(g, "~M~W(/**string*/input) {~+~N", p); w(g, "~Mthis.setCurrent(input);~N"); writef(g, "~Mthis.#~W();~N", p); w(g, "~Mreturn this.getCurrent();~N"); w(g, "~-~M}~N"); if (SIZE(q->s) == 4 && memcmp(q->s, "stem", 4) == 0) { w(g, "~N"); w(g, "~MstemWord = this.stem;~N"); } } str_append(saved_output, g->declarations); str_append(saved_output, g->outbuf); str_delete(g->declarations); str_delete(g->outbuf); g->declarations = saved_declarations; g->outbuf = saved_output; } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn true;~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Ma = this.find_among~S0(a_~I0~F);~N", p); if (!x->always_matches) { write_failure_if(g, "a === 0", p); } } else if (x->always_matches) { writef(g, "~Mthis.find_among~S0(a_~I0~F);~N", p); } else if (x->command_count == 0 && tailcallable(g, p)) { writef(g, "~Mreturn this.find_among~S0(a_~I0~F) !== 0;~N", p); x->node->right = NULL; g->unreachable = true; } else { write_failure_if(g, "this.find_among~S0(a_~I0~F) === 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { if (x->same_action == c_slicefrom && x->command_count > 1) { if (x->nocommand_count > 0) { w(g, "~Mif (a > 0) {~N+"); } write_comment(g, x->commands[0]); g->I[0] = x->number; w(g, "~Mthis.slice_from(as_~I0[a - 1]);~N"); if (x->nocommand_count > 0) { write_block_end(g); } g->unreachable = false; return; } w(g, "~Mswitch (a) {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0: {~N~+"); generate(g, x->commands[i - 1]); if (!g->unreachable) w(g, "~Mbreak;~N"); w(g, "~-~M}~N"); g->unreachable = false; } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. if (inverted) { writef(g, "~Mreturn !~V;~N", p); } else { writef(g, "~Mreturn ~V;~N", p); } p->right = NULL; g->unreachable = true; return; } if (inverted) { write_failure_if(g, "~V", p); } else { write_failure_if(g, "!~V", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mthis.debug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); 
break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: /* Snowball specifies integer division with semantics matching C, * so we need to use `Math.trunc(x/y)` here. 
*/ writef(g, "~M~V = Math.trunc(~V / ", p); generate_AE(g, p->AE); w(g, ");~N"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "import ~P from './base-stemmer.js'~N" "~N" "export default class extends ~P {~+~N"); write_newline(g); } static void generate_class_end(struct generator * g) { w(g, "~-}~N"); w(g, "~N"); } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Mconst a_~I0 = [~N~+"); for (int i = 0; i < x->literalstring_count; i++) { if (i) w(g, ",~N"); g->I[0] = v[i].result; w(g, "~M["); write_literal_string(g, v[i].b); w(g, ", ~I0"); if (v[i].i >= 0 || v[i].function != NULL) { w(g, ", "); write_int(g, v[i].i >= 0 ? 
i - v[i].i : 0);
        }
        if (v[i].function != NULL) {
            w(g, ", ");
            write_int(g, v[i].function_index);
        }
        w(g, "]");
    }
    w(g, "~N~-~M];~N~N");

    /* When every command is a slicefrom and there is more than one, also emit
     * an `as_<number>` array holding the replacement strings in command order.
     */
    if (x->same_action == c_slicefrom && x->command_count > 1) {
        g->I[0] = x->number;
        w(g, "~Mconst /** Array */ as_~I0 = [");
        for (int i = 1; i <= x->command_count; i++) {
            if (i > 1) w(g, ", ");
            write_literal_string(g, x->commands[i - 1]->left->literalstring);
        }
        w(g, "];~N~N");
    }
}

/* Emit the table(s) for every among known to the analyser. */
static void generate_amongs(struct generator * g) {
    for (struct among * x = g->analyser->amongs; x; x = x->next) {
        generate_among_table(g, x);
    }
}

/* Emit the private `#af_<number>()` method for an among which has more than
 * one distinct function: a switch on `this.af` which returns the result of
 * calling the matching function, or false if no case matches.
 */
static void generate_among_dispatcher(struct generator * g, struct among * x) {
    /* With zero or one function no dispatcher is generated. */
    if (x->function_count <= 1) return;

    struct amongvec * v = x->b;

    write_comment(g, x->node);

    g->I[0] = x->number;
    w(g, "~N~M/** @return {boolean} */~N");
    w(g, "~M#af_~I0() {~N~+");
    w(g, "~Mswitch (this.af) {~N~+");
    for (int n = 1; n <= x->function_count; n++) {
        w(g, "~Mcase ");
        write_int(g, n);
        w(g, ": return ");
        /* Find the first among entry using function index n and call it. */
        for (int i = 0; i < x->literalstring_count; i++) {
            if (v[i].function_index == n) {
                write_varref(g, v[i].function);
                w(g, "();~N");
                break;
            }
        }
    }
    w(g, "~-~M}~N");
    w(g, "~Mreturn false;~N");
    w(g, "~-~M}~N");
}

/* Emit a dispatcher for every among that needs one. */
static void generate_among_dispatchers(struct generator * g) {
    for (struct among * x = g->analyser->amongs; x; x = x->next) {
        generate_among_dispatcher(g, x);
    }
}

/* Set bit i in the bitmap b (8 bits per element). */
static void set_bit(symbol * b, int i) {
    b[i >> 3] |= 1 << (i & 7);
}

/* Emit a grouping as a constant array of byte-sized bitmap values covering
 * the range smallest_ch..largest_ch.
 */
static void generate_grouping_table(struct generator * g, struct grouping * q) {
    int range = q->largest_ch - q->smallest_ch + 1;
    int size = (range + 7) / 8;  /* assume 8 bits per symbol */
    symbol * b = q->b;
    symbol * map = create_b(size);

    /* Start from an all-zero bitmap. */
    for (int i = 0; i < size; i++) map[i] = 0;

    /* Bit positions are relative to the smallest character in the grouping. */
    for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);

    w(g, "~Mconst /** Array */ ");
    write_varname(g, q->name);
    write_string(g, " = [");
    for (int i = 0; i < size; i++) {
        if (i) w(g, ", ");
        write_int(g, map[i]);
    }
    w(g, "];~N~N");

    lose_b(map);
}

/* Emit the bitmap table for every grouping known to the analyser. */
static void generate_groupings(struct generator * g) {
    for (struct
grouping * q = g->analyser->groupings; q; q = q->next) {
        generate_grouping_table(g, q);
    }
}

/* Emit a private class field, with a default initialiser, for every string,
 * integer and boolean name which is not localised to a routine.  A blank line
 * follows if at least one field was written.
 */
static void generate_members(struct generator * g) {
    bool wrote_members = false;

    for (struct name * q = g->analyser->names; q; q = q->next) {
        /* Localised names are not emitted as class members. */
        if (q->local_to) continue;
        switch (q->type) {
            case t_string:
                w(g, "~M#");
                write_varname(g, q);
                w(g, "/** string */ = '';~N");
                wrote_members = true;
                break;
            case t_integer:
                w(g, "~M#");
                write_varname(g, q);
                w(g, "/** number */ = 0;~N");
                wrote_members = true;
                break;
            case t_boolean:
                w(g, "~M#");
                write_varname(g, q);
                w(g, "/** boolean */ = false;~N");
                wrote_members = true;
                break;
        }
    }
    if (wrote_members) w(g, "~N");
}

/* Generate code for each top-level node of the program, clearing the
 * unreachable flag after each one.
 */
static void generate_methods(struct generator * g) {
    for (struct node * p = g->analyser->program; p; p = p->right) {
        generate(g, p);
        g->unreachable = false;
    }
}

/* Entry point for the JavaScript backend: builds the whole output in
 * g->outbuf and then writes it to the configured output stream.
 */
extern void generate_program_js(struct generator * g) {
    g->outbuf = str_new();
    g->failure_str = str_new();

    write_start_comment(g, "// ", NULL);

    // Disable some deno-lint warnings which the generated code can trigger.
    w(g, "// deno-lint-ignore-file"
         // Some of our generated deno-lint-ignore comments may not be used.
         " ban-unused-ignore"
         // Expressions in conditionals may be constant.
         " no-constant-condition"
         // Empty blocks may be generated in some cases.
         " no-empty"
         // Among var `a` is only assigned to once per `among`, but is declared
         // at the start of the function and may be initialised in a nested
         // block.
         //
         // Some localised variables may only be assigned to once.
" prefer-const~N~N"); generate_amongs(g); generate_groupings(g); generate_class_begin(g); generate_members(g); generate_among_dispatchers(g); generate_methods(g); generate_class_end(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.1.0/compiler/generator_pascal.c000066400000000000000000001314731520373054300206360ustar00rootroot00000000000000#include #include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h" #define BASE_UNIT "SnowballProgram" #define BASE_CLASS "T" BASE_UNIT /* prototype functions for recursive use: */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); /* Write routines for items from the syntax tree */ static void write_relop(struct generator * g, int relop) { // Relational operators are the same as C, save for (in-)equality. switch (relop) { case c_eq: write_string(g, " = "); break; case c_ne: write_string(g, " <> "); break; default: write_c_relop(g, relop); } } static void write_varname(struct generator * g, struct name * p) { if (p->type == t_external) { if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } } else { /* Pascal identifiers are case-insensitive but Snowball identifiers * should be case-sensitive. To address this, if any groups of * identifiers of the same type have the same case, we insert a counter * after the type-code for all but one of them. This count is unique * within each type of variable so this avoid collisions while being * minimally intrusive on the readability of the generated code. 
*
         * So for example:
         *
         * Snowball integer `i` -> Pascal `I_i`
         * Snowball integer `I` -> Pascal `I2_i`
         *
         * We don't try to solve this problem for external identifiers - it
         * seems more helpful to leave those alone and encourage snowball
         * program authors to avoid naming externals which only differ by
         * case.
         *
         * We use the same naming scheme for both global and local variables.
         */
        /* Type-code letter, selected by indexing with the name's type. */
        write_char(g, "SBIrxg"[p->type]);
        if (p->case_collision) {
            write_int(g, p->count);
        }
        write_char(g, '_');
    }
    write_s(g, p->s);
}

/* Write the symbol string p as a Pascal string expression.
 *
 * Printable ASCII (32..126) is written inside single quotes, with an embedded
 * single quote doubled.  Every other character is written outside the quotes
 * as `#` followed by its decimal code.  An empty string is written as ''.
 */
static void write_literal_string(struct generator * g, symbol * p) {
    if (SIZE(p) == 0) {
        write_string(g, "''");
        return;
    }
    /* Tracks whether a quoted run is currently open in the output. */
    bool in_quotes = false;
    for (int i = 0; i < SIZE(p); i++) {
        int ch = p[i];
        if (32 <= ch && ch < 127) {
            if (!in_quotes) {
                write_char(g, '\'');
                in_quotes = true;
            }
            /* Double the quote character to escape it. */
            if (ch == '\'') write_char(g, '\'');
            write_char(g, ch);
        } else {
            /* Close any open quoted run before emitting a #NNN code. */
            if (in_quotes) {
                write_char(g, '\'');
                in_quotes = false;
            }
            write_char(g, '#');
            write_int(g, ch);
        }
    }
    if (in_quotes) {
        write_char(g, '\'');
    }
}

/* Write a variable declaration.
*/
/* The declaration is formatted with writef() into g->declarations (not the
 * main output buffer), followed by ';' and a newline.
 */
static void write_declare(struct generator * g,
                          const char * declaration,
                          struct node * p) {
    /* Temporarily redirect output to the declarations buffer. */
    struct str * temp = g->outbuf;
    g->outbuf = g->declarations;
    write_string(g, " ");
    writef(g, declaration, p);
    write_char(g, ';');
    write_newline(g);
    g->outbuf = temp;
}

/* Write a `// ...` comment line for node p, if comments are enabled. */
static void write_comment(struct generator * g, struct node * p) {
    if (!g->options->comments) return;
    write_margin(g);
    write_string(g, "// ");
    write_comment_content(g, p, NULL);
    write_newline(g);
}

/* Open a Pascal compound statement and increase the margin. */
static void write_block_start(struct generator * g) {
    w(g, "~MBegin~+~N");
}

/* Decrease the margin and close a Pascal compound statement. */
static void write_block_end(struct generator * g) {
    w(g, "~-~MEnd;~N");
}

/* Declare the Integer variable named by savevar and emit code storing the
 * cursor in it: FCursor in forward mode, FLimit - FCursor otherwise.
 */
static void write_savecursor(struct generator * g, struct node * p,
                             struct str * savevar) {
    g->B[0] = str_data(savevar);
    g->S[1] = "";
    if (p->mode != m_forward) g->S[1] = "FLimit - ";
    write_declare(g, "~B0 : Integer", p);
    writef(g, "~M~B0 := ~S1FCursor;~N" , p);
}

/* Append to `out` the statement which restores the cursor from savevar
 * (the inverse of what write_savecursor() stores).
 */
static void append_restore_string(struct node * p, struct str * out, struct str * savevar) {
    str_append_string(out, "FCursor := ");
    if (p->mode != m_forward) str_append_string(out, "FLimit - ");
    str_append(out, savevar);
    str_append_ch(out, ';');
}

/* Emit the cursor-restoring statement on its own line in the output. */
static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) {
    write_margin(g);
    append_restore_string(p, g->outbuf, savevar);
    write_newline(g);
}

/* Emit code moving the cursor by one: Inc forwards, Dec otherwise. */
static void write_inc_cursor(struct generator * g, struct node * p) {
    write_margin(g);
    write_string(g, p->mode == m_forward ?
"Inc(FCursor);" : "Dec(FCursor);");
    write_newline(g);
}

/* Emit the definition of label `lab<n>`. */
static void wsetl(struct generator * g, int n) {
    w(g, "lab");
    write_int(g, n);
    w(g, ":~N");
}

/* Emit the code to signal failure in the current context.
 *
 * Any pending failure string is written first.  Then, if the failure label is
 * x_return, the generated code sets Result to False and exits; otherwise it
 * jumps to the failure label, and the label is recorded as used.  The code
 * following is marked unreachable.
 */
static void write_failure(struct generator * g) {
    if (str_len(g->failure_str) != 0) {
        write_margin(g);
        write_str(g, g->failure_str);
        write_newline(g);
    }
    write_margin(g);
    switch (g->failure_label) {
        case x_return:
            write_string(g, "Begin Result := False; Exit; End;");
            break;
        default:
            write_string(g, "goto lab");
            write_int(g, g->failure_label);
            write_char(g, ';');
            g->label_used = 1;
    }
    write_newline(g);
    g->unreachable = true;
}

/* Emit `If (<s>) Then Begin <failure> End;` where s is a writef() format.
 * The failure is conditional, so code after it remains reachable.
 */
static void write_failure_if(struct generator * g, const char * s, struct node * p) {
    writef(g, "~MIf (", p);
    writef(g, s, p);
    writef(g, ") Then~N", p);
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
    g->unreachable = false;
}

/* if at limit fail */
static void write_check_limit(struct generator * g, struct node * p) {
    if (p->mode == m_forward) {
        write_failure_if(g, "FCursor >= FLimit", p);
    } else {
        write_failure_if(g, "FCursor <= FBkLimit", p);
    }
}

/* Formatted write.
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case 's': write_int(g, SIZE(p->literalstring)); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varname(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "MAXINT"); break; case c_minint: write_string(g, "(-MAXINT - 1)"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); 
break;
        /* Binary operators share the parenthesised output code at label0. */
        case c_multiply:
            s = " * "; goto label0;
        case c_divide:
            s = " div "; goto label0;
        case c_plus:
            s = " + "; goto label0;
        case c_minus:
            s = " - ";
        label0:
            write_char(g, '('); generate_AE(g, p->left);
            write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break;
        case c_cursor:
            w(g, "FCursor"); break;
        case c_limit:
            w(g, p->mode == m_forward ? "FLimit" : "FBkLimit"); break;
        case c_len:
        case c_size:
            w(g, "Length(current)");
            break;
        case c_lenof:
        case c_sizeof:
            writef(g, "Length(~V)", p);
            break;
    }
}

/* Generate each command of a bracketed sequence in order. */
static void generate_bra(struct generator * g, struct node * p) {
    write_comment(g, p);
    p = p->left;
    while (p) {
        generate(g, p);
        p = p->right;
    }
}

/* Generate an `and`: each operand is generated in turn, with the cursor
 * restored to its saved value between operands when a save is needed.
 */
static void generate_and(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_and(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    while (p) {
        generate(g, p);
        /* Nothing after an operand which cannot complete can run. */
        if (g->unreachable) break;
        /* No restore is needed after the final operand. */
        if (savevar && p->right != NULL) write_restorecursor(g, p, savevar);
        p = p->right;
    }

    if (savevar) {
        str_delete(savevar);
    }
}

/* Generate an `or` as a `Repeat ... Until True;` block: each operand but the
 * last gets its own failure label and a `Break;` on success.
 */
static void generate_or(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_or(p->left)) {
        savevar = vars_newname(g);
    }

    /* Save the enclosing failure context; it is reinstated for the last
     * operand. */
    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);
    bool end_unreachable = true;

    write_comment(g, p);
    w(g, "~MRepeat~N~+");

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    str_clear(g->failure_str);

    if (p == NULL) {
        /* p should never be NULL after an or: there should be at least two
         * sub nodes.
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; g->label_used = 0; generate(g, p); if (!g->unreachable) { w(g, "~MBreak;~N"); end_unreachable = false; } if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); w(g, "~-~MUntil True;~N"); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~MFBkLimit := FCursor; FCursor := FLimit;~N", p); generate(g, p->left); w(g, "~MFCursor := FBkLimit;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (savevar) { write_block_start(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); if (g->label_used) wsetl(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); write_block_end(g); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; g->label_used = 0; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if 
(savevar) {
        str_delete(savevar);
    }
}

/* Generate `set`: assign True to the boolean variable. */
static void generate_set(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V := True;~N", p);
}

/* Generate `unset`: assign False to the boolean variable. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V := False;~N", p);
}

/* Generate `fail`: run the operand, then signal failure if it can complete. */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    if (!g->unreachable) write_failure(g);
}

/* generate_test() also implements 'reverse' */
static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (savevar) {
        /* Only restore the cursor if the operand can complete. */
        if (!g->unreachable) {
            write_restorecursor(g, p, savevar);
        }
        str_delete(savevar);
    }
}

/* Generate `do`: run the operand, ignore whether it failed, and restore the
 * cursor afterwards when a save is needed.
 */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);
    if (savevar) write_savecursor(g, p, savevar);

    if (p->left->type == c_call) {
        /* Optimise do <call> */
        /* The call's Boolean result is simply discarded. */
        write_comment(g, p->left);
        writef(g, "~M~V();~N", p->left);
    } else {
        int label = new_label(g);
        g->failure_label = label;
        str_clear(g->failure_str);

        generate(g, p->left);
        if (g->label_used) wsetl(g, g->failure_label);
        g->unreachable = false;
    }

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }
}

/* Generate `next`: fail if at the limit, otherwise move the cursor by one. */
static void generate_next(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_check_limit(g, p);
    write_inc_cursor(g, p);
}

/* Generate goto/gopast applied directly to a grouping (or its complement)
 * via the Go{In,Out}Grouping[Bk] runtime helpers, failing if the helper
 * returns False.  For gopast the cursor is then moved one further.
 */
static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
    write_comment(g, p);

    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "Bk";
    g->S[1] = complement ? "In" : "Out";
    g->I[0] = q->smallest_ch;
    g->I[1] = q->largest_ch;
    write_failure_if(g, "Not (Go~S1Grouping~S0(~V, ~I0, ~I1))", p);
    if (!is_goto) {
        w(g, p->mode == m_forward ?
"~MInc(FCursor);~N" : "~MDec(FCursor);~N");
    }
}

/* Generate general goto (is_goto != 0) or gopast (is_goto == 0) as a
 * `While True Do` loop: try the operand; on success Break (restoring the
 * cursor first for goto); on failure restore the cursor if saved, fail if at
 * the limit, otherwise advance the cursor by one and retry.
 */
static void generate_GO(struct generator * g, struct node * p, int is_goto) {
    write_comment(g, p);

    /* Save the enclosing failure context. */
    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);
    bool end_unreachable = false;

    w(g, "~MWhile True Do~N");
    write_block_start(g);

    struct str * savevar = NULL;
    if (is_goto || repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
    str_clear(g->failure_str);
    generate(g, p->left);

    if (g->unreachable) {
        /* Cannot break out of this loop: therefore the code after the
         * end of the loop is unreachable.*/
        end_unreachable = true;
    } else {
        /* include for goto; omit for gopast */
        if (is_goto) write_restorecursor(g, p, savevar);
        w(g, "~MBreak;~N");
    }

    g->unreachable = false;
    if (g->label_used)
        wsetl(g, g->failure_label);
    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    /* The limit check below fails to the enclosing context. */
    g->label_used = used;
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    write_block_end(g);
    g->unreachable = end_unreachable;
}

/* Generate `loop AE C` as a Pascal `For ... DownTo 1 Do` over a freshly
 * declared Integer counter.
 */
static void generate_loop(struct generator * g, struct node * p) {
    write_comment(g, p);
    struct str * loopvar = vars_newname(g);
    g->B[0] = str_data(loopvar);
    write_declare(g, "~B0 : Integer", p);
    w(g, "~MFor ~B0 := ");
    generate_AE(g, p->AE);
    writef(g, " DownTo 1 Do~N", p);
    writef(g, "~{", p);

    generate(g, p->left);

    w(g, "~}");
    str_delete(loopvar);
    g->unreachable = false;
}

/* Shared body of `repeat` and `atleast`: a labelled block which runs the
 * operand and jumps back to the label on success.  When loopvar is non-NULL
 * it is decremented on each successful iteration.
 */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    int replab = new_label(g);
    g->I[0] = replab;
    writef(g, "lab~I0:~N~{", p);

    struct str * savevar = NULL;
    if (repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
    str_clear(g->failure_str);
    generate(g, p->left);

    if (!g->unreachable) {
if (loopvar != NULL) {
            g->B[0] = str_data(loopvar);
            w(g, "~MDec(~B0);~N");
        }

        /* Successful iteration: go round again. */
        g->I[0] = replab;
        w(g, "~Mgoto lab~I0;~N");
    }

    if (g->label_used)
        wsetl(g, g->failure_label);
    g->unreachable = false;

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    w(g, "~}");
}

/* Generate `repeat C`. */
static void generate_repeat(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate_repeat_or_atleast(g, p, NULL);
}

/* Generate `atleast AE C`: a counter is initialised from the AE, the repeat
 * loop decrements it per successful iteration, and the command fails if the
 * counter is still positive afterwards.
 */
static void generate_atleast(struct generator * g, struct node * p) {
    write_comment(g, p);
    w(g, "~{");

    struct str * loopvar = vars_newname(g);
    g->B[0] = str_data(loopvar);
    write_declare(g, "~B0 : Integer", p);
    w(g, "~M~B0 := ");
    generate_AE(g, p->AE);
    w(g, ";~N");
    {
        /* The loop body replaces the failure context; put it back so that
         * the final check fails to the enclosing context. */
        int a0 = g->failure_label;
        struct str * a1 = str_copy(g->failure_str);

        generate_repeat_or_atleast(g, p, loopvar);

        g->failure_label = a0;
        str_delete(g->failure_str);
        g->failure_str = a1;
    }
    /* g->B[0] may have been overwritten while generating the body. */
    g->B[0] = str_data(loopvar);
    write_failure_if(g, "~B0 > 0", p);
    w(g, "~}");
    str_delete(loopvar);
}

/* Generate `tomark AE`: fail if the cursor is already beyond the mark
 * (in the current direction), otherwise set the cursor to the mark.
 */
static void generate_tomark(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? ">" : "<";

    w(g, "~MIf (FCursor ~S0 ");
    generate_AE(g, p->AE);
    w(g, ") Then~N");
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
    g->unreachable = false;
    w(g, "~MFCursor := ");
    generate_AE(g, p->AE);
    writef(g, ";~N", p);
}

/* Generate `hop AE`: compute the target position in the temporary C, fail if
 * it is outside the limit (or, for a non-constant distance, if it moves the
 * wrong way), otherwise assign it to the cursor.
 */
static void generate_hop(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "+" : "-";

    w(g, "~{~MC := FCursor ~S0 ");
    generate_AE(g, p->AE);
    w(g, ";~N");

    g->S[1] = p->mode == m_forward ? "> FLimit" : "< FBkLimit";
    g->S[2] = p->mode == m_forward ? "<" : ">";
    if (p->AE->type == c_number) {
        // Constant distance hop.
        //
        // No need to check for negative hop as that's converted to false by
        // the analyser.
write_failure_if(g, "C ~S1", p);
    } else {
        write_failure_if(g, "(C ~S1) Or (C ~S2 FCursor)", p);
    }
    writef(g, "~MFCursor := C;~N", p);
    writef(g, "~}", p);
    /* Tells generate_define() to declare the temporary C. */
    g->temporary_used = true;
}

/* Generate `delete` as a call to SliceDel. */
static void generate_delete(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~MSliceDel;~N", p);
}

/* Generate `tolimit`: move the cursor to the limit for the direction. */
static void generate_tolimit(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "FLimit" : "FBkLimit";
    writef(g, "~MFCursor := ~S0;~N", p);
}

/* Generate `[`: record the cursor in FBra (forward) or FKet (backward). */
static void generate_leftslice(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "FBra" : "FKet";
    writef(g, "~M~S0 := FCursor;~N", p);
}

/* Generate `]`: record the cursor in FKet (forward) or FBra (backward). */
static void generate_rightslice(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "FKet" : "FBra";
    writef(g, "~M~S0 := FCursor;~N", p);
}

/* Generate `=> S` as an assignment from the runtime's AssignTo(). */
static void generate_assignto(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V := AssignTo();~N", p);
}

/* Generate `-> S` as an assignment from the runtime's SliceTo(). */
static void generate_sliceto(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V := SliceTo();~N", p);
}

/* Write the string operand of p: its literal string if it has one,
 * otherwise the name of its variable.
 */
static void generate_address(struct generator * g, struct node * p) {
    symbol * b = p->literalstring;
    if (b != NULL) {
        write_literal_string(g, b);
    } else {
        write_varname(g, p->name);
    }
}

/* Generate `insert` / `attach` (selected by `style`) as an insert() at the
 * cursor.  The cursor is saved in the temporary C and put back afterwards
 * for attach in forward mode and for insert in backward mode.
 */
static void generate_insert(struct generator * g, struct node * p, int style) {
    write_comment(g, p);

    int keep_c = style == c_attach;
    if (p->mode == m_backward) keep_c = !keep_c;
    if (keep_c) {
        w(g, "~{~MC := FCursor;~N");
        g->temporary_used = true;
    }
    writef(g, "~Minsert(FCursor, FCursor, ", p);
    generate_address(g, p);
    writef(g, ");~N", p);
    if (keep_c) w(g, "~MFCursor := C;~N~}");
}

/* Generate `<-` on the current string: replace the text between the cursor
 * and the limit for the direction; in forward mode the cursor is preserved.
 */
static void generate_stringassign(struct generator * g, struct node * p) {
    write_comment(g, p);

    int keep_c = p->mode == m_forward; /* like 'attach' */
    if (keep_c) {
        writef(g, "~{~MC := FCursor;~N", p);
        g->temporary_used = true;
    }
    if (p->mode == m_forward) {
        writef(g, "~Minsert(FCursor, FLimit, ", p);
    } else {
writef(g, "~Minsert(FBkLimit, FCursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~MFCursor := c;~N~}"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~MSliceFrom("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~MIf (FCursor ~S0 "); generate_AE(g, q->AE); w(g, ") Then~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); write_declare(g, "~B0 : Integer", p); if (p->mode == m_forward) { w(g, "~M~B0 := FLimit;~N"); w(g, "~MFLimit := "); generate_AE(g, q->AE); w(g, ";~N"); w(g, "~M~B0 := ~B0 - FLimit;~N"); } else { w(g, "~M~B0 := FBkLimit;~N"); w(g, "~MFBkLimit := "); generate_AE(g, q->AE); w(g, ";~N"); } if (p->mode == m_forward) { str_assign(g->failure_str, "FLimit := FLimit + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "FBkLimit := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); write_declare(g, "~B0 : Integer", p); if (p->mode == m_forward) { w(g, "~M~B0 := FLimit - FCursor;~N"); w(g, "~MFLimit := FCursor;~N"); } else { w(g, "~M~B0 := FBkLimit;~N"); w(g, "~MFBkLimit := FCursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "FLimit := FLimit + 
");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            } else {
                str_assign(g->failure_str, "FBkLimit := ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            }
        }
        str_delete(savevar);
    }

    if (!g->unreachable) {
        generate(g, p->aux);

        /* On the success path, emit the same limit-restoring statement that
         * was installed as the failure string. */
        if (!g->unreachable) {
            write_margin(g);
            write_str(g, g->failure_str);
            write_newline(g);
        }
    }
    str_delete(varname);
}

/* dollar sets snowball up to operate on a string variable as if it were the
 * current string */
static void generate_dollar(struct generator * g, struct node * p) {
    write_comment(g, p);

    /* Save the enclosing failure context; the operand gets its own label so
     * that the saved state can be restored before failure is propagated. */
    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);
    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
    str_clear(g->failure_str);

    struct str * savevar = vars_newname(g);
    g->B[0] = str_data(savevar);
    /* One saved copy of each piece of stemmer state which is replaced. */
    write_declare(g, "~B0_Current : AnsiString", p);
    write_declare(g, "~B0_Cursor : Integer", p);
    write_declare(g, "~B0_Limit : Integer", p);
    write_declare(g, "~B0_BkLimit : Integer", p);
    write_declare(g, "~B0_Bra : Integer", p);
    write_declare(g, "~B0_Ket : Integer", p);
    writef(g, "~{"
              "~M~B0_Current := FCurrent;~N"
              "~M~B0_Cursor := FCursor;~N"
              "~M~B0_Limit := FLimit;~N"
              "~M~B0_BkLimit := FBkLimit;~N"
              "~M~B0_Bra := FBra;~N"
              "~M~B0_Ket := FKet;~N"
              "~MFCurrent := ~V;~N"
              "~MFCursor := 0;~N"
              "~MFLimit := Length(current);~N", p);

    /* possible_signals == -1: the operand may succeed or fail, so the
     * outcome has to be tracked in a flag at run time. */
    if (p->left->possible_signals == -1) {
        /* Assume failure. */
        write_declare(g, "~B0_F : Boolean", p);
        w(g, "~M~B0_F := True;~N");
    }

    generate(g, p->left);

    if (!g->unreachable && p->left->possible_signals == -1) {
        /* Mark success.
*/ g->B[0] = str_data(savevar); w(g, "~M~B0_F := False;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~M~V := FCurrent;~N" "~MFCurrent := ~B0_Current;~N" "~MFCursor := ~B0_Cursor;~N" "~MFLimit := ~B0_Limit;~N" "~MFBkLimit := ~B0_BkLimit;~N" "~MFBra := ~B0_Bra;~N" "~MFKet := ~B0_Ket;~N", p); if (p->left->possible_signals == 0) { // p->left always signals f. w(g, "~M~f~N"); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_F", p); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); writef(g, "~M~W := ", p); if (s != NULL) { g->S[0] = s; writef(g, "~W ~S0 ", p); } generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~MResult := "); p->right = NULL; } else { w(g, "~MIf "); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, "~N"); } else { w(g, " Then~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~MResult := ~V;~N", p); p->right = NULL; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~MBegin; Result := ~V; Exit; End;~N", p); g->unreachable = true; return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~V;~N", p); } else if (signals == 0) { /* Always fails. 
*/
        writef(g, "~M~V;~N", p);
        write_failure(g);
    } else {
        write_failure_if(g, "Not ~V", p);
    }
}

/* Generate a grouping test (or, with `complement`, a non-grouping test) via
 * the {In,Out}Grouping[Bk] runtime helpers.  In tail position the helper's
 * result is assigned straight to Result.
 */
static void generate_grouping(struct generator * g, struct node * p, int complement) {
    write_comment(g, p);

    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "Bk";
    g->S[1] = complement ? "Out" : "In";
    g->I[0] = q->smallest_ch;
    g->I[1] = q->largest_ch;
    if (tailcallable(g, p)) {
        writef(g, "~MResult := ~S1Grouping~S0(~V, ~I0, ~I1);~N", p);
        p->right = NULL;
    } else {
        write_failure_if(g, "Not (~S1Grouping~S0(~V, ~I0, ~I1))", p);
    }
}

/* Generate a match against the value of a string variable (EqS / EqSBk). */
static void generate_namedstring(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "Bk";
    if (tailcallable(g, p)) {
        writef(g, "~MResult := EqS~S0(~V);~N", p);
        p->right = NULL;
    } else {
        write_failure_if(g, "Not (EqS~S0(~V))", p);
    }
}

/* Generate a match against a literal string (EqS / EqSBk). */
static void generate_literalstring(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "Bk";
    if (tailcallable(g, p)) {
        writef(g, "~MResult := EqS~S0(~L);~N", p);
        p->right = NULL;
    } else {
        write_failure_if(g, "Not (EqS~S0(~L))", p);
    }
}

/* Generate the Pascal function for a routine definition.
 *
 * The body is generated into temporary buffers so that the Var and Label
 * sections, whose contents are only known once the body has been generated,
 * can be written ahead of it.
 */
static void generate_define(struct generator * g, struct node * p) {
    struct name * q = p->name;

    write_newline(g);
    write_comment(g, p);

    /* Generate function header. */
    writef(g, "~MFunction T~n.~W : Boolean;~N", p);

    /* Save output. */
    struct str * saved_output = g->outbuf;
    struct str * saved_declarations = g->declarations;
    g->outbuf = str_new();
    g->declarations = str_new();

    /* Per-function state: labels and generated variable names restart at 0,
     * and failing at the top level means returning from the function. */
    g->next_label = 0;
    g->var_number = 0;

    str_clear(g->failure_str);
    g->failure_label = x_return;
    g->unreachable = false;

    /* Generate function body.
*/
    w(g, "~{");
    g->temporary_used = false;
    generate(g, p->left);
    if (p->left->right) {
        assert(p->left->right->type == c_functionend);
        /* Skip the function-end node when possible_signals is 0. */
        if (p->left->possible_signals) {
            generate(g, p->left->right);
        }
    }
    w(g, "~}");

    if (q->amongvar_needed) {
        str_append_string(g->declarations, " AmongVar : Integer;\n");
    }

    if (g->temporary_used) {
        str_append_string(g->declarations, " C : Integer;\n");
    }

    /* Declare localised variables. */
    struct str * temp = g->outbuf;
    g->outbuf = g->declarations;
    for (struct name * name = g->analyser->names; name; name = name->next) {
        if (name->local_to == q) {
            switch (name->type) {
                case t_string:
                    w(g, " ");
                    write_varname(g, name);
                    w(g, " : AnsiString;\n");
                    break;
                case t_integer:
                    w(g, " ");
                    write_varname(g, name);
                    w(g, " : Integer;\n");
                    break;
                case t_boolean:
                    w(g, " ");
                    write_varname(g, name);
                    w(g, " : Boolean;\n");
                    break;
            }
        }
    }
    g->outbuf = temp;

    /* Emit the Var section only if something was declared. */
    if (str_len(g->declarations) > 0) {
        str_append_string(saved_output, "Var\n");
        str_append(saved_output, g->declarations);
    }

    /* Declare every label number allocated while generating the body. */
    if (g->next_label) {
        str_append_string(saved_output, "Label\n");

        int num = g->next_label;
        for (int i = 0; i < num; ++i) {
            str_append_string(saved_output, " lab");
            str_append_int(saved_output, i);
            str_append_string(saved_output, i == num - 1 ? ";\n" : ",\n");
        }
    }

    /* Finally append the body and restore the real output buffers. */
    str_append(saved_output, g->outbuf);
    str_delete(g->declarations);
    str_delete(g->outbuf);
    g->declarations = saved_declarations;
    g->outbuf = saved_output;
}

/* Emitted at the end of a function body: set Result to True. */
static void generate_functionend(struct generator * g, struct node * p) {
    (void)p;
    w(g, "~MResult := True;~N");
}

/* Generate the among lookup (FindAmong / FindAmongBk). */
static void generate_substring(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct among * x = p->among;

    g->S[0] = p->mode == m_forward ?
"" : "Bk";
    g->I[0] = x->number;
    g->I[1] = x->literalstring_count;

    if (x->amongvar_needed) {
        /* The result is needed later to select the command to run. */
        writef(g, "~MAmongVar := FindAmong~S0(a_~I0, ~I1);~N", p);
        if (!x->always_matches) {
            write_failure_if(g, "AmongVar = 0", p);
        }
    } else if (x->always_matches) {
        writef(g, "~MFindAmong~S0(a_~I0, ~I1);~N", p);
    } else if (x->command_count == 0 && tailcallable(g, p)) {
        /* In tail position with no commands: return whether it matched. */
        writef(g, "~MResult := FindAmong~S0(a_~I0, ~I1) <> 0;~N", p);
        x->node->right = NULL;
    } else {
        write_failure_if(g, "FindAmong~S0(a_~I0, ~I1) = 0", p);
    }
}

/* Generate an among: the lookup (unless a separate `substring` already did
 * it) followed by the command(s) selected by the lookup result.
 */
static void generate_among(struct generator * g, struct node * p) {
    struct among * x = p->among;

    if (x->substring == NULL) {
        generate_substring(g, p);
    } else {
        write_comment(g, p);
    }

    if (x->command_count == 1 && x->nocommand_count == 0) {
        /* Only one outcome ("no match" already handled). */
        generate(g, x->commands[0]);
    } else if (x->command_count > 0) {
        w(g, "~MCase AmongVar Of~N~+");
        for (int i = 1; i <= x->command_count; i++) {
            g->I[0] = i;
            w(g, "~M~I0:~N~{");
            generate(g, x->commands[i - 1]);
            w(g, "~}");
            /* Each case starts out reachable. */
            g->unreachable = false;
        }
        write_block_end(g);
    }
}

/* Generate a test of a boolean variable (negated when `inverted`). */
static void generate_booltest(struct generator * g, struct node * p, int inverted) {
    write_comment(g, p);
    if (tailcallable(g, p)) {
        // Optimise at end of function.
if (inverted) { writef(g, "~MResult := !~V;~N", p); } else { writef(g, "~MResult := ~V;~N", p); } p->right = NULL; return; } if (inverted) { write_failure_if(g, "~V", p); } else { write_failure_if(g, "Not (~V)", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case 
c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, NULL); break; case c_plusassign: generate_integer_assign(g, p, "+"); break; case c_minusassign: generate_integer_assign(g, p, "-"); break; case c_multiplyassign:generate_integer_assign(g, p, "*"); break; case c_divideassign: generate_integer_assign(g, p, "div"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } /* Class declaration generation. 
*/

/* Emit the start comment, the "Unit <name>;" header, the hints directive and
 * the Interface section's Uses clause.
 */
static void generate_unit_start(struct generator * g) {
    write_start_comment(g, "// ", NULL);
    w(g, "Unit ~n;~N~N{$HINTS OFF}~N~NInterface~N~NUses " BASE_UNIT ";~N");
}

/* Emit the closing "End." of the unit. */
static void generate_unit_end(struct generator * g) {
    w(g, "~NEnd.~N");
}

/* Open the "Type" section and the class declaration T<name> = Class(...). */
static void generate_class_begin(struct generator * g) {
    w(g, "~NType~N~+~MT~n = Class(" BASE_CLASS ")~N");
}

/* Close the class declaration and start the Implementation section. */
static void generate_class_end(struct generator * g) {
    w(g, "~}~NImplementation~N");
}

/* Emit one "Function <name> : Boolean;" declaration.  An external whose name
 * is exactly "stem" is additionally marked Override.
 */
static void generate_method_decl(struct generator * g, struct name * q) {
    w(g, "~MFunction ");
    write_varname(g, q);
    w(g, " : Boolean;");
    if (q->type == t_external) {
        if (SIZE(q->s) == 4 && memcmp(q->s, "stem", 4) == 0) {
            w(g, " Override;");
        }
    }
    w(g, "~N");
}

/* Emit the public section (constructor plus every external) followed by a
 * private section for routines.  The "private" header is only written once
 * the first routine is found.
 */
static void generate_method_decls(struct generator * g) {
    w(g, "~Mpublic~N~+");
    w(g, "~MConstructor Create;~N");

    for (struct name * q = g->analyser->names; q; q = q->next) {
        if (q->type == t_external) {
            generate_method_decl(g, q);
        }
    }
    w(g, "~-");

    bool first = true;
    for (struct name * q = g->analyser->names; q; q = q->next) {
        if (q->type == t_routine) {
            if (first) {
                w(g, "~Mprivate~N~+");
                first = false;
            }
            generate_method_decl(g, q);
        }
    }
    /* Undo the margin increase only if a private section was opened. */
    if (!first) w(g, "~-");
}

/* Emit private field declarations for every string, integer and boolean
 * name that is not local to a routine (q->local_to unset).
 */
static void generate_member_decls(struct generator * g) {
    bool first = true;
    for (struct name * q = g->analyser->names; q; q = q->next) {
        if (q->local_to) continue;
        switch (q->type) {
            case t_string:
            case t_integer:
            case t_boolean:
                if (first) {
                    w(g, "~Mprivate~N~+");
                    first = false;
                }
                switch (q->type) {
                    case t_string:
                        write_margin(g);
                        write_varname(g, q);
                        w(g, " : AnsiString;~N");
                        break;
                    case t_integer:
                        write_margin(g);
                        write_varname(g, q);
                        w(g, " : Integer;~N");
                        break;
                    case t_boolean:
                        write_margin(g);
                        write_varname(g, q);
                        w(g, " : Boolean;~N");
                        break;
                }
        }
    }
    if (!first) w(g, "~-");
}

/* Emit one private "a_<n> : Array Of TAmong;" field per among; nothing is
 * written when there are no amongs.
 */
static void generate_among_decls(struct generator * g) {
    struct among *a = g->analyser->amongs;
    if (a == NULL) return;

    w(g, "~Mprivate~N~+");

    while (a != NULL) {
        g->I[0] = a->number;
        w(g, "~Ma_~I0 : Array Of TAmong;~N");
        a = a->next;
    }

    w(g, "~-");
}

/* Emit the statements that size and fill the table a_<n> for one among:
 * Str, Index, Result and Method for each of its literal strings.
 */
static void generate_among_table(struct generator * g, struct among * x) {
    write_comment(g, x->node);

    struct amongvec * v = x->b;

    g->I[0] = x->number;
    g->I[1] = x->literalstring_count;

    w(g, "~MSetLength(a_~I0, ~I1);~N~+");

    for (int i = 0; i < x->literalstring_count; i++) {
        /* I[1] is reused as the element index from here on. */
        g->I[1] = i;

        /* Write among's string. */
        w(g, "~Ma_~I0[~I1].Str := ");
        write_literal_string(g, v[i].b);
        w(g, ";~N");

        /* Write among's index & result. */
        g->I[2] = v[i].i;
        w(g, "~Ma_~I0[~I1].Index := ~I2;~N");
        g->I[2] = v[i].result;
        w(g, "~Ma_~I0[~I1].Result := ~I2;~N");

        /* Write among's handler. */
        w(g, "~Ma_~I0[~I1].Method := ");
        if (v[i].function == NULL) {
            w(g, "nil;~N~N");
        } else {
            w(g, "Self.");
            write_varname(g, v[i].function);
            w(g, ";~N~N");
        }
    }
    w(g, "~-");
}

/* Emit the table initialisation for every among. */
static void generate_amongs(struct generator * g) {
    for (struct among * x = g->analyser->amongs; x; x = x->next) {
        generate_among_table(g, x);
    }
}

/* Emit the constructor, whose body is the among table initialisation. */
static void generate_constructor(struct generator * g) {
    w(g, "~N~MConstructor T~n.Create;~N~{");
    generate_amongs(g);
    w(g, "~}");
}

/* Generate every top-level node of the program, resetting the unreachable
 * flag after each one.
 */
static void generate_methods(struct generator * g) {
    for (struct node * p = g->analyser->program; p; p = p->right) {
        generate(g, p);
        g->unreachable = false;
    }
}

/* Set bit i in the bitmap b (8 bits used per element). */
static void set_bit(symbol * b, int i) {
    b[i >> 3] |= 1 << (i & 7);
}

/* Emit a typed constant array of Char holding a bitmap of the grouping's
 * members, with bit positions relative to q->smallest_ch.
 */
static void generate_grouping_table(struct generator * g, struct grouping * q) {
    int range = q->largest_ch - q->smallest_ch + 1;
    int size = (range + 7) / 8;  /* assume 8 bits per symbol */
    symbol * b = q->b;
    symbol * map = create_b(size);

    for (int i = 0; i < size; i++) map[i] = 0;

    for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);

    /* Upper bound of the Pascal array index range. */
    g->I[0] = size - 1;
    w(g, "~N~MConst~+~N~M");
    write_varname(g, q->name);
    w(g, " : Array [0..~I0] Of Char = (~N~+");
    for (int i = 0; i < size; i++) {
        if (i != 0) w(g, ",~N");
        g->I[0] = map[i];
        w(g, "~MChr(~I0)");
    }
    w(g, "~N~-~M);~N~-");

    lose_b(map);
}

/* Emit the bitmap constant for every grouping. */
static void generate_groupings(struct generator * g) {
    for (struct grouping * q = g->analyser->groupings; q; q = q->next) {
        generate_grouping_table(g, q);
    }
}

/* Entry point of the Pascal generator: build the whole unit in g->outbuf,
 * write it to g->options->output_src, then free the buffers created here.
 */
extern void generate_program_pascal(struct generator * g) {
    g->outbuf = str_new();
    g->failure_str = str_new();

    generate_unit_start(g);

    /* Generate class declaration. */
    generate_class_begin(g);
    generate_member_decls(g);
    generate_among_decls(g);
    generate_method_decls(g);
    generate_class_end(g);

    /* generate implementation. */
    generate_groupings(g);
    generate_constructor(g);
    generate_methods(g);

    generate_unit_end(g);

    output_str(g->options->output_src, g->outbuf);
    str_delete(g->failure_str);
    str_delete(g->outbuf);
}
snowball-3.1.0/compiler/generator_php.c000066400000000000000000001222521520373054300201550ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h"

/* prototype functions for recursive use: */

static void generate(struct generator * g, struct node * p);
static void w(struct generator * g, const char * s);
static void writef(struct generator * g, const char * s, struct node * p);

/* Write routines for items from the syntax tree */

/* Write a relational operator with surrounding spaces.  c_eq and c_ne are
 * written as " === " and " !== "; every other operator is delegated to
 * write_c_relop().
 */
static void write_relop(struct generator * g, int relop) {
    // Relational operators are the same as C, save for (in-)equality.
    switch (relop) {
        case c_eq: write_string(g, " === "); break;
        case c_ne: write_string(g, " !== "); break;
        default: write_c_relop(g, relop);
    }
}

/* Write the bare PHP identifier for a name (no "$", "$this->" or "self::").
 * Externals get the optional externals prefix; all other names get a type
 * letter taken from "SBIrxG" indexed by p->type, followed by '_'.
 */
static void write_varname(struct generator * g, struct name * p) {
    if (p->type == t_external) {
        if (g->options->externals_prefix) {
            write_string(g, g->options->externals_prefix);
        }
    } else {
        // We use the same naming scheme for both global and local variables.
        write_char(g, "SBIrxG"[p->type]);
        write_char(g, '_');
    }
    write_s(g, p->s);
}

/* Reference to variable, e.g. when assigning to or using in an expression.
*/ static void write_varref(struct generator * g, struct name * p) { if (p->type == t_grouping) { write_string(g, "self::"); } else if (p->local_to) { write_char(g, '$'); } else { write_string(g, "$this->"); } write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { int i = 0; write_char(g, '"'); while (i < SIZE(p)) { int ch; i += get_utf8(p + i, &ch); if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\' || ch == '$') write_char(g, '\\'); write_char(g, ch); } else { write_string(g, "\\u{"); write_hex4(g, ch); write_string(g, "}"); } } write_char(g, '"'); } static void write_literal_char(struct generator * g, symbol ch) { write_char(g, '"'); if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_char(g, '\\'); write_char(g, ch); } else { write_string(g, "\\u{"); write_hex4(g, ch); write_string(g, "}"); } write_char(g, '"'); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p, NULL); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "$this->limit - "; writef(g, "~M~B0 = ~S1$this->cursor;~N", p); } static void append_restore_string(struct node * p, struct str * out, struct str * savevar) { str_append_string(out, "$this->cursor = "); if (p->mode != m_forward) str_append_string(out, "$this->limit - "); str_append(out, savevar); str_append_ch(out, ';'); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); append_restore_string(p, g->outbuf, savevar); write_newline(g); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); 
write_string(g, p->mode == m_forward ? "$this->inc_cursor();" : "$this->dec_cursor();"); write_newline(g); } static void wsetl(struct generator * g, int n) { w(g, "~-~Mlab~+"); write_int(g, n); w(g, ":~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "goto lab"); write_int(g, n); write_char(g, ';'); write_newline(g); g->unreachable = true; } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_char(g, ';'); g->label_used = 1; } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { w(g, "~Mif ("); writef(g, s, p); w(g, ") {~+~N"); write_failure(g); w(g, "~-~M}~N"); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "$this->cursor >= $this->limit", p); } else { write_failure_if(g, "$this->cursor <= $this->limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "PHP_INT_MAX"); break; case c_minint: write_string(g, "PHP_INT_MIN"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: 
s = " * "; goto label0;
        case c_divide:
            /* Snowball specifies integer division with semantics matching C,
             * so we need to use `intdiv($x, $y)` here.
             */
            write_string(g, "intdiv");
            s = ", ";
            goto label0;
        case c_plus:
            s = " + "; goto label0;
        case c_minus:
            s = " - ";
        label0:
            /* Shared tail for binary operators: "(" left s right ")". */
            write_char(g, '('); generate_AE(g, p->left);
            write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break;
        case c_cursor:
            w(g, "$this->cursor"); break;
        case c_limit:
            w(g, p->mode == m_forward ? "$this->limit" : "$this->limit_backward"); break;
        case c_len:
            w(g, "mb_strlen($this->current, 'UTF-8')");
            break;
        case c_size:
            w(g, "strlen($this->current)");
            break;
        case c_lenof:
            writef(g, "mb_strlen(~V, 'UTF-8')", p);
            break;
        case c_sizeof:
            writef(g, "strlen(~V)", p);
            break;
    }
}

/* Generate each child of a bracketed command list in order. */
static void generate_bra(struct generator * g, struct node * p) {
    write_comment(g, p);
    p = p->left;
    while (p) {
        generate(g, p);
        p = p->right;
    }
}

/* Generate "and": every child is generated in turn, with the cursor
 * restored between children when a save variable was needed.  Generation
 * stops as soon as the code becomes unreachable.
 */
static void generate_and(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_and(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    while (p) {
        generate(g, p);
        if (g->unreachable) break;
        /* No restore after the last child. */
        if (savevar && p->right != NULL) write_restorecursor(g, p, savevar);
        p = p->right;
    }

    if (savevar) {
        str_delete(savevar);
    }
}

/* Generate "or": each child but the last gets a fresh failure label; on
 * success it jumps to a shared exit label (allocated on first use), on
 * failure control falls to its label and the next child is tried.  The last
 * child runs with the caller's failure label and failure string restored.
 */
static void generate_or(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_or(p->left)) {
        savevar = vars_newname(g);
    }

    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int out_lab = -1;
    bool end_unreachable = true;

    write_comment(g, p);

    /* NOTE(review): the save is additionally gated on K_needed_node_on_f(p),
     * while the restores in the loop below are gated on savevar alone --
     * confirm the two conditions always agree. */
    if (savevar && K_needed_node_on_f(p)) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    str_clear(g->failure_str);

    if (p == NULL) {
        /* p should never be NULL after an or: there should be at least two
         * sub nodes.
         */
        fprintf(stderr, "Error: \"or\" node without children nodes.");
        exit(1);
    }
    while (p->right != NULL) {
        int label = new_label(g);
        g->failure_label = label;
        g->label_used = 0;
        generate(g, p);
        if (!g->unreachable) {
            /* This alternative can succeed: jump past the remaining ones. */
            if (out_lab < 0) out_lab = new_label(g);
            wgotol(g, out_lab);
            end_unreachable = false;
        }

        if (g->label_used)
            wsetl(g, g->failure_label);
        g->unreachable = false;
        if (savevar) {
            write_restorecursor(g, p, savevar);
        }
        p = p->right;
    }

    g->label_used = used;
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    generate(g, p);

    if (out_lab >= 0) {
        wsetl(g, out_lab);
    }
    if (!end_unreachable) {
        g->unreachable = false;
    }

    if (savevar) {
        str_delete(savevar);
    }
}

/* Generate "backwards": set limit_backward to the cursor, move the cursor
 * to the limit, generate the body, then put the cursor back at
 * limit_backward.
 */
static void generate_backwards(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M$this->limit_backward = $this->cursor;~N"
              "~M$this->cursor = $this->limit;~N", p);
    generate(g, p->left);
    w(g, "~M$this->cursor = $this->limit_backward;~N");
}

/* Generate "not": the body runs with a fresh failure label; if the body can
 * complete, the caller's failure action is emitted after it; the fresh
 * label (when used) marks where execution continues.
 */
static void generate_not(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_node_on_f(p->left)) {
        savevar = vars_newname(g);
    }

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    write_comment(g, p);
    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    /* NOTE(review): g->label_used is not cleared here (generate_try and
     * generate_or do clear it) -- confirm a stale flag cannot cause an
     * unused label to be emitted. */
    int label = new_label(g);
    g->failure_label = label;
    str_clear(g->failure_str);

    generate(g, p->left);

    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    if (!g->unreachable) write_failure(g);

    if (g->label_used)
        wsetl(g, label);
    g->unreachable = false;

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }
}

/* Generate "try": the body runs with a fresh failure label whose failure
 * string restores the cursor (when a save variable was needed); code after
 * the try is always reachable.
 */
static void generate_try(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
    str_clear(g->failure_str);

    write_comment(g, p);
    if (savevar) {
        write_savecursor(g, p, savevar);
        append_restore_string(p, g->failure_str, savevar);
    }

    generate(g, p->left);

    if (g->label_used)
        wsetl(g, g->failure_label);
    g->unreachable = false;

    if (savevar) {
        str_delete(savevar);
    }
}

/* Emit "<var> = true;". */
static void generate_set(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = true;~N", p);
}

/* Emit "<var> = false;". */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = false;~N", p);
}

/* Generate "fail": the body, followed by the failure action if the end of
 * the body is reachable.
 */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    if (!g->unreachable) write_failure(g);
}

/* generate_test() also implements 'reverse' */

static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (savevar) {
        /* The restore is only emitted on the reachable (success) path. */
        if (!g->unreachable) {
            write_restorecursor(g, p, savevar);
        }
        str_delete(savevar);
    }
}

/* Generate "do": the body's failure is absorbed, and the cursor is restored
 * afterwards when a save variable was needed.  A body that is a plain call
 * is emitted as a bare call statement.
 */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);
    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    if (p->left->type == c_call) {
        /* Optimise do  */
        write_comment(g, p->left);
        writef(g, "~M~V();~N", p->left);
    } else {
        /* NOTE(review): g->label_used is not cleared before generating the
         * body (generate_try does clear it) -- confirm this is intended. */
        int label = new_label(g);
        g->failure_label = label;
        str_clear(g->failure_str);

        generate(g, p->left);

        if (g->label_used)
            wsetl(g, g->failure_label);
        g->unreachable = false;
    }

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }
}

/* Generate "next": fail if at the limit, otherwise advance the cursor. */
static void generate_next(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_check_limit(g, p);
    write_inc_cursor(g, p);
}

/* Generate goto/gopast over a grouping via the go_{in,out}_grouping[_b]
 * runtime helpers, failing when the helper returns false.  For gopast
 * (is_goto == 0) the cursor is then advanced by one more character.
 */
static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
    write_comment(g, p);

    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->S[1] = complement ? "in" : "out";
    write_failure_if(g, "!$this->go_~S1_grouping~S0(~V)", p);
    if (!is_goto) {
        if (p->mode == m_forward)
            w(g, "~M$this->inc_cursor();~N");
        else
            w(g, "~M$this->dec_cursor();~N");
    }
}

/* Generate a general goto/gopast as a "while (true)" loop: try the body; on
 * success break out (restoring the cursor first for goto); on failure
 * restore the cursor if it was saved, fail if at the limit, otherwise
 * advance one character and retry.
 */
static void generate_GO(struct generator * g, struct node * p, int is_goto) {
    write_comment(g, p);

    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    bool end_unreachable = false;

    w(g, "~Mwhile (true) {~N~+");

    struct str * savevar = NULL;
    if (is_goto || repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
    str_clear(g->failure_str);
    generate(g, p->left);

    if (g->unreachable) {
        /* Cannot break out of this loop: therefore the code after the
         * end of the loop is unreachable.*/
        end_unreachable = true;
    } else {
        /* include for goto; omit for gopast */
        if (is_goto) write_restorecursor(g, p, savevar);
        w(g, "~Mbreak;~N");
    }

    g->unreachable = false;
    if (g->label_used)
        wsetl(g, g->failure_label);
    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    /* The limit check below must use the caller's failure action. */
    g->label_used = used;
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    write_block_end(g);
    g->unreachable = end_unreachable;
}

/* Generate "loop AE C" as a PHP for loop counting a fresh variable down
 * from the AE's value to zero.
 */
static void generate_loop(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct str * loopvar = vars_newname(g);
    g->B[0] = str_data(loopvar);
    w(g, "~Mfor (~B0 = ");
    generate_AE(g, p->AE);
    /* B[0] is set again after generating the AE. */
    g->B[0] = str_data(loopvar);
    writef(g, "; ~B0 > 0; ~B0--) {~+~N", p);

    generate(g, p->left);

    w(g, "~}");
    str_delete(loopvar);
    g->unreachable = false;
}

/* Shared loop for "repeat" and "atleast"; loopvar is NULL for repeat. */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    writef(g, "~Mwhile (true) {~N~+", p);

    struct str * savevar = NULL;
    if (repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
str_clear(g->failure_str);
    generate(g, p->left);

    if (!g->unreachable) {
        /* Body succeeded: count it (atleast only) and go round again. */
        if (loopvar != NULL) {
            g->B[0] = str_data(loopvar);
            w(g, "~M~B0--;~N");
        }

        w(g, "~Mcontinue;~N");
    }

    if (g->label_used)
        wsetl(g, g->failure_label);
    g->unreachable = false;

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    w(g, "~Mbreak;~N~}");
}

/* Generate "repeat C". */
static void generate_repeat(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate_repeat_or_atleast(g, p, NULL);
}

/* Generate "atleast AE C": initialise a counter from the AE, run the shared
 * loop (which decrements it per successful iteration), then fail if the
 * counter is still above zero.
 */
static void generate_atleast(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct str * loopvar = vars_newname(g);

    g->B[0] = str_data(loopvar);
    w(g, "~M~B0 = ");
    generate_AE(g, p->AE);
    w(g, ";~N");
    {
        /* Keep the caller's failure state intact across the loop. */
        int a0 = g->failure_label;
        struct str * a1 = str_copy(g->failure_str);

        generate_repeat_or_atleast(g, p, loopvar);

        g->failure_label = a0;
        str_delete(g->failure_str);
        g->failure_str = a1;
    }
    g->B[0] = str_data(loopvar);
    write_failure_if(g, "~B0 > 0", p);
    str_delete(loopvar);
}

/* Generate "tomark AE": fail if the cursor is already beyond the mark in
 * the current direction, otherwise set the cursor to it.  The AE is emitted
 * twice in the output.
 */
static void generate_tomark(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? ">" : "<";

    w(g, "~Mif ($this->cursor ~S0 ");
    generate_AE(g, p->AE);
    w(g, ") {~+~N");
    write_failure(g);
    w(g, "~-~M}~N");
    g->unreachable = false;
    w(g, "~M$this->cursor = ");
    generate_AE(g, p->AE);
    writef(g, ";~N", p);
}

/* Generate "hop AE" via the hop / hop_back runtime helpers; the "_checked"
 * variant is selected when the AE is not a literal number.
 */
static void generate_hop(struct generator * g, struct node * p) {
    write_comment(g, p);

    // Generate the AE to a temporary block so we can substitute it in
    // write_failure_if().
    struct str * ae = str_new();
    struct str * s = g->outbuf;
    g->outbuf = ae;
    generate_AE(g, p->AE);
    g->outbuf = s;

    g->B[0] = str_data(ae);
    g->S[0] = p->mode == m_forward ? "" : "_back";
    g->S[1] = p->AE->type == c_number ? "" : "_checked";
    write_failure_if(g, "!$this->hop~S0~S1(~B0)", p);
    str_delete(ae);
}

/* Emit a slice_del() call. */
static void generate_delete(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M$this->slice_del();~N", p);
}

/* Move the cursor to the limit for the current direction. */
static void generate_tolimit(struct generator * g, struct node * p) {
    write_comment(g, p);
    if (p->mode == m_forward) {
        writef(g, "~M$this->cursor = $this->limit;~N", p);
    } else {
        writef(g, "~M$this->cursor = $this->limit_backward;~N", p);
    }
}

/* Emit the left-slice assignment: bra in forward mode, ket otherwise. */
static void generate_leftslice(struct generator * g, struct node * p) {
    write_comment(g, p);
    if (p->mode == m_forward) {
        writef(g, "~M$this->bra = $this->cursor;~N", p);
    } else {
        writef(g, "~M$this->ket = $this->cursor;~N", p);
    }
}

/* Emit the right-slice assignment: ket in forward mode, bra otherwise. */
static void generate_rightslice(struct generator * g, struct node * p) {
    write_comment(g, p);
    if (p->mode == m_forward) {
        writef(g, "~M$this->ket = $this->cursor;~N", p);
    } else {
        writef(g, "~M$this->bra = $this->cursor;~N", p);
    }
}

/* Assign to the variable the prefix of the current string up to limit. */
static void generate_assignto(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = substr($this->current, 0, $this->limit);~N", p);
}

/* Assign the result of slice_to() to the variable. */
static void generate_sliceto(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = $this->slice_to();~N", p);
}

/* Write the string operand of a node: its literal string if it has one,
 * otherwise a reference to its named variable.
 */
static void generate_address(struct generator * g, struct node * p) {
    symbol * b = p->literalstring;
    if (b != NULL) {
        write_literal_string(g, b);
    } else {
        write_varref(g, p->name);
    }
}

/* Generate insert/attach (style is the node type).  keep_c selects the form
 * that saves the cursor in $c and restores it after the insertion; it is
 * set for attach and flipped in backward mode.
 */
static void generate_insert(struct generator * g, struct node * p, int style) {
    write_comment(g, p);

    int keep_c = style == c_attach;
    if (p->mode == m_backward) keep_c = !keep_c;
    if (keep_c) {
        w(g, "~M$c = $this->cursor;~N");
        writef(g, "~M$this->insert($c, $c, ", p);
    } else {
        writef(g, "~M$this->insert($this->cursor, $this->cursor, ", p);
    }
    generate_address(g, p);
    writef(g, ");~N", p);
    if (keep_c) w(g, "~M$this->cursor = $c;~N");
}

/* Generate a string assignment, replacing the text between the cursor and
 * the limit in the current direction.
 */
static void generate_stringassign(struct generator * g, struct node * p) {
    write_comment(g, p);

    int keep_c = p->mode == m_forward; /* like 'attach' */
    if (keep_c) {
        w(g, "~M$c = $this->cursor;~N");
        /* keep_c is only set in forward mode, so the else arm below is
         * not taken as the code stands. */
        if (p->mode == m_forward) {
            writef(g, "~M$this->insert($c, $this->limit, ", p);
        } else {
            writef(g, "~M$this->insert($this->limit_backward, $c, ", p);
        }
    } else {
        if (p->mode == m_forward) {
            writef(g, "~M$this->insert($this->cursor, $this->limit, ", p);
        } else {
            writef(g, "~M$this->insert($this->limit_backward, $this->cursor, ", p);
        }
    }
    generate_address(g, p);
    writef(g, ");~N", p);
    if (keep_c) {
        w(g, "~M$this->cursor = $c;~N");
    }
}

/* Emit slice_from(<string operand>). */
static void generate_slicefrom(struct generator * g, struct node * p) {
    write_comment(g, p);
    w(g, "~M$this->slice_from(");
    generate_address(g, p);
    writef(g, ");~N", p);
}

/* Generate "setlimit C1 for C2".  The old limit is remembered in a fresh
 * variable, and a statement restoring it is installed as the failure string
 * and also emitted after C2 on the success path.
 */
static void generate_setlimit(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct str * varname = vars_newname(g);

    if (p->left && p->left->type == c_tomark) {
        /* Special case for:
         *
         *   setlimit tomark AE for C
         *
         * All uses of setlimit in the current stemmers we ship follow this
         * pattern, and by special-casing we can avoid having to save and
         * restore c.
         */
        struct node * q = p->left;
        write_comment(g, q);
        g->S[0] = q->mode == m_forward ? ">" : "<";
        w(g, "~Mif ($this->cursor ~S0 ");
        generate_AE(g, q->AE);
        w(g, ") {~+~N");
        write_failure(g);
        w(g, "~-~M}~N");
        g->unreachable = false;

        g->B[0] = str_data(varname);
        if (p->mode == m_forward) {
            /* Forward: the variable ends up holding old limit - new limit,
             * which the restore statement adds back. */
            w(g, "~M~B0 = $this->limit;~N");
            w(g, "~M$this->limit = ");
            generate_AE(g, q->AE);
            w(g, ";~N");
            w(g, "~M~B0 -= $this->limit;~N");
        } else {
            /* Backward: the variable holds the old limit_backward. */
            w(g, "~M~B0 = $this->limit_backward;~N");
            w(g, "~M$this->limit_backward = ");
            generate_AE(g, q->AE);
            w(g, ";~N");
        }

        if (p->mode == m_forward) {
            str_assign(g->failure_str, "$this->limit += ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        } else {
            str_assign(g->failure_str, "$this->limit_backward = ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        }
    } else {
        /* General case: run C1 to find the new limit, then restore c. */
        struct str * savevar = vars_newname(g);
        write_savecursor(g, p, savevar);

        generate(g, p->left);

        if (!g->unreachable) {
            g->B[0] = str_data(varname);
            w(g, "~M~B0 = ");
            if (p->mode == m_forward) {
                w(g, "$this->limit - $this->cursor;~N");
                w(g, "~M$this->limit = $this->cursor;~N");
            } else {
                w(g, "$this->limit_backward;~N");
                w(g, "~M$this->limit_backward = $this->cursor;~N");
            }
            write_restorecursor(g, p, savevar);

            if (p->mode == m_forward) {
                str_assign(g->failure_str, "$this->limit += ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            } else {
                str_assign(g->failure_str, "$this->limit_backward = ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            }
        }
        str_delete(savevar);
    }

    if (!g->unreachable) {
        generate(g, p->aux);

        /* Success path: emit the same limit-restoring statement. */
        if (!g->unreachable) {
            write_margin(g);
            write_str(g, g->failure_str);
            write_newline(g);
        }
    }
    str_delete(varname);
}

/* dollar sets snowball up to operate on a string variable as if it were the
 * current string */
static void generate_dollar(struct generator * g, struct node * p) {
    write_comment(g, p);

    int used = g->label_used;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int label = new_label(g);
    g->failure_label = label;
    g->label_used = 0;
str_clear(g->failure_str); struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~M~B0 = clone $this;~N", p); writef(g, "~M$this->current = ~V;~N" "~M$this->cursor = 0;~N" "~M$this->limit_backward = 0;~N" "~M$this->limit = strlen($this->current);~N", p); if (p->left->possible_signals == -1) { /* Assume failure. */ w(g, "~M~B0_f = true;~N"); } generate(g, p->left); if (!g->unreachable && p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_f = false;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); writef(g, "~M~V = $this->current;~N" "~M$this->copyFrom(~B0);~N", p); if (p->left->possible_signals == 0) { // p->left always signals f. w(g, "~M~f~N"); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_f", p); } str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); if (p->AE->type == c_number && p->AE->number == 1) { if (strcmp(s, "+=") == 0) { // Optimise `+= 1` to increment. writef(g, "~M++~V;~N", p); return; } else if (strcmp(s, "-=") == 0) { // Optimise `-= 1` to decrement. writef(g, "~M--~V;~N", p); return; } } g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif ("); // We want the inverse of the snowball test here. 
relop ^= 1;
    }
    generate_AE(g, p->left);
    write_relop(g, relop);
    generate_AE(g, p->AE);
    if (optimise_to_return) {
        w(g, ";~N");
    } else {
        w(g, ") {~+~N");
        write_failure(g);
        w(g, "~-~M}~N");
        /* write_failure() marked the code unreachable, but that only applies
         * inside the `if` block just closed. */
        g->unreachable = false;
    }
}

/* Emit a PHP call to the routine named by p.
 *
 * `signals` is the callee's possible_signals: 1 and 0 are handled as
 * "always succeeds" and "always fails" respectively, anything else gets a
 * runtime check of the returned value.
 */
static void generate_call(struct generator * g, struct node * p) {
    int signals = p->name->definition->possible_signals;
    write_comment(g, p);
    if (tailcallable(g, p)) {
        /* Tail call. */
        writef(g, "~Mreturn ~V();~N", p);
        p->right = NULL;
        g->unreachable = true;
        return;
    }
    if (just_return_on_fail(g) && signals == 0) {
        /* Always fails. */
        writef(g, "~Mreturn ~V();~N", p);
        g->unreachable = true;
        return;
    }
    if (signals == 1) {
        /* Always succeeds. */
        writef(g, "~M~V();~N", p);
    } else if (signals == 0) {
        /* Always fails. */
        writef(g, "~M~V();~N", p);
        write_failure(g);
    } else {
        write_failure_if(g, "!~V()", p);
    }
}

/* Emit a PHP grouping test: in_grouping / out_grouping (selected by
 * `complement`), with a "_b" suffix when p->mode is not m_forward.
 */
static void generate_grouping(struct generator * g, struct node * p, int complement) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->S[1] = complement ? "out" : "in";
    if (tailcallable(g, p)) {
        writef(g, "~Mreturn $this->~S1_grouping~S0(~V);~N", p);
        p->right = NULL;
    } else {
        write_failure_if(g, "!($this->~S1_grouping~S0(~V))", p);
    }
}

/* Emit a PHP eq_s / eq_s_b test against the string variable named by p. */
static void generate_namedstring(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "_b";
    if (tailcallable(g, p)) {
        writef(g, "~Mreturn $this->eq_s~S0(~V);~N", p);
        p->right = NULL;
    } else {
        write_failure_if(g, "!($this->eq_s~S0(~V))", p);
    }
}

/* Emit a PHP eq_s / eq_s_b test against the literal string held by p. */
static void generate_literalstring(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "_b";
    if (tailcallable(g, p)) {
        writef(g, "~Mreturn $this->eq_s~S0(~L);~N", p);
        p->right = NULL;
    } else {
        write_failure_if(g, "!($this->eq_s~S0(~L))", p);
    }
}

/* Emit the PHP method for a routine/external definition.
 *
 * Names of type t_routine become `protected` methods, everything else
 * `public`.  The body is generated into fresh outbuf/declarations buffers
 * and then appended to the saved output with the declarations first.
 */
static void generate_define(struct generator * g, struct node * p) {
    struct name * q = p->name;

    write_newline(g);
    write_newline(g);
    write_comment(g, p);

    if (q->type == t_routine) {
        writef(g, "~Mprotected function ~W(): bool~N", p);
    } else {
        writef(g, "~Mpublic function ~W(): bool~N", p);
    }
    write_block_start(g);

    /* Save output. */
    struct str * saved_output = g->outbuf;
    struct str * saved_declarations = g->declarations;
    g->outbuf = str_new();
    g->declarations = str_new();

    /* Reset the per-function generator state. */
    g->next_label = 0;
    g->var_number = 0;

    str_clear(g->failure_str);
    g->failure_label = x_return;
    g->unreachable = false;
    g->keep_count = 0;

    /* Generate function body. */
    generate(g, p->left);
    if (p->left->right) {
        assert(p->left->right->type == c_functionend);
        /* The function-end node is skipped when possible_signals is 0. */
        if (p->left->possible_signals) {
            generate(g, p->left->right);
        }
    }
    write_block_end(g);

    str_append(saved_output, g->declarations);
    str_append(saved_output, g->outbuf);
    str_delete(g->declarations);
    str_delete(g->outbuf);
    g->declarations = saved_declarations;
    g->outbuf = saved_output;
}

/* Emit the `return true;` that ends a PHP method body. */
static void generate_functionend(struct generator * g, struct node * p) {
    (void)p;
    w(g, "~Mreturn true;~N");
}

/* Emit the PHP find_among / find_among_b call for the among attached to p.
 *
 * The result is stored in $among_var only when x->amongvar_needed; the
 * failure check is omitted when x->always_matches.
 */
static void generate_substring(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct among * x = p->among;

    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->I[0] = x->number;

    if (x->amongvar_needed) {
        writef(g, "~M$among_var = $this->find_among~S0(self::A_~I0);~N", p);
        if (!x->always_matches) {
            write_failure_if(g, "0 === $among_var", p);
        }
    } else if (x->always_matches) {
        writef(g, "~M$this->find_among~S0(self::A_~I0);~N", p);
    } else if (x->command_count == 0 && tailcallable(g, p)) {
        writef(g, "~Mreturn $this->find_among~S0(self::A_~I0) !== 0;~N", p);
        x->node->right = NULL;
        g->unreachable = true;
    } else {
        write_failure_if(g, "$this->find_among~S0(self::A_~I0) === 0", p);
    }
}

/* Emit PHP for an among: the lookup (unless a separate substring node
 * exists, i.e. x->substring is non-NULL) followed by the dispatch on
 * $among_var to the among's commands.
 */
static void generate_among(struct generator * g, struct node * p) {
    struct among * x = p->among;

    if (x->substring == NULL) {
        generate_substring(g, p);
    } else {
        write_comment(g, p);
    }

    if (x->command_count == 1 && x->nocommand_count == 0) {
        /* Only one outcome ("no match" already handled). */
        generate(g, x->commands[0]);
    } else if (x->command_count > 0) {
        if (x->same_action == c_slicefrom && x->command_count > 1) {
            /* Every command is a slicefrom: index the AS_<n> table of
             * replacement strings instead of emitting a switch. */
            if (x->nocommand_count > 0) {
                w(g, "~Mif ($among_var > 0) {~N~+");
            }
            write_comment(g, x->commands[0]);
            g->I[0] = x->number;
            w(g, "~M$this->slice_from(self::AS_~I0[$among_var - 1]);~N");
            if (x->nocommand_count > 0) {
                write_block_end(g);
            }
            g->unreachable = false;
            return;
        }
        w(g, "~Mswitch ($among_var) {~N~+");
        for (int i = 1; i <= x->command_count; i++) {
            g->I[0] = i;
            w(g, "~Mcase ~I0:~N~+");
            generate(g, x->commands[i - 1]);
            if (!g->unreachable) w(g, "~Mbreak;~N");
            w(g, "~-");
            g->unreachable = false;
        }
        write_block_end(g);
    }
}

/* Emit a PHP test of the boolean variable named by p; `inverted` selects
 * the negated test.
 */
static void generate_booltest(struct generator * g, struct node * p, int inverted) {
    write_comment(g, p);
    if (tailcallable(g, p)) {
        // Optimise at end of function.
if (inverted) {
            writef(g, "~Mreturn !~V;~N", p);
        } else {
            writef(g, "~Mreturn ~V;~N", p);
        }
        p->right = NULL;
        g->unreachable = true;
        return;
    }
    if (inverted) {
        write_failure_if(g, "~V", p);
    } else {
        write_failure_if(g, "!~V", p);
    }
}

/* Emit the failure code unconditionally. */
static void generate_false(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_failure(g);
}

/* Emit a `$this->debug(<count>, <line>)` call; <count> is a running counter
 * kept in g->debug_count and <line> is p->line_number.
 */
static void generate_debug(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->I[0] = g->debug_count++;
    g->I[1] = p->line_number;
    writef(g, "~M$this->debug(~I0, ~I1);~N", p);
}

/* Dispatch on p->type to the matching PHP generate_* routine.
 *
 * Nothing is generated while g->unreachable is set.  The failure label and
 * failure string are saved on entry and restored on exit, so changes made
 * while generating p do not leak to the caller.
 */
static void generate(struct generator * g, struct node * p) {
    if (g->unreachable) return;

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    switch (p->type) {
        case c_define:        generate_define(g, p); break;
        case c_bra:           generate_bra(g, p); break;
        case c_and:           generate_and(g, p); break;
        case c_or:            generate_or(g, p); break;
        case c_backwards:     generate_backwards(g, p); break;
        case c_not:           generate_not(g, p); break;
        case c_set:           generate_set(g, p); break;
        case c_unset:         generate_unset(g, p); break;
        case c_try:           generate_try(g, p); break;
        case c_fail:          generate_fail(g, p); break;
        case c_reverse:
        case c_test:          generate_test(g, p); break;
        case c_do:            generate_do(g, p); break;
        case c_goto:          generate_GO(g, p, 1); break;
        case c_gopast:        generate_GO(g, p, 0); break;
        case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break;
        case c_gopast_grouping:
                              generate_GO_grouping(g, p, 0, 0); break;
        case c_goto_non:      generate_GO_grouping(g, p, 1, 1); break;
        case c_gopast_non:    generate_GO_grouping(g, p, 0, 1); break;
        case c_repeat:        generate_repeat(g, p); break;
        case c_loop:          generate_loop(g, p); break;
        case c_atleast:       generate_atleast(g, p); break;
        case c_tomark:        generate_tomark(g, p); break;
        case c_hop:           generate_hop(g, p); break;
        case c_delete:        generate_delete(g, p); break;
        case c_next:          generate_next(g, p); break;
        case c_tolimit:       generate_tolimit(g, p); break;
        case c_leftslice:     generate_leftslice(g, p); break;
        case c_rightslice:    generate_rightslice(g, p);
break;
        case c_assignto:      generate_assignto(g, p); break;
        case c_sliceto:       generate_sliceto(g, p); break;
        case c_stringassign:  generate_stringassign(g, p); break;
        case c_insert:
        case c_attach:        generate_insert(g, p, p->type); break;
        case c_slicefrom:     generate_slicefrom(g, p); break;
        case c_setlimit:      generate_setlimit(g, p); break;
        case c_dollar:        generate_dollar(g, p); break;
        case c_assign:        generate_integer_assign(g, p, "="); break;
        case c_plusassign:    generate_integer_assign(g, p, "+="); break;
        case c_minusassign:   generate_integer_assign(g, p, "-="); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
        case c_divideassign:
            /* Snowball specifies integer division with semantics matching C,
             * so we need to use `intdiv($x,$y)` here. */
            writef(g, "~M~V = intdiv(~V,", p);
            generate_AE(g, p->AE);
            w(g, ");~N");
            break;
        case c_eq:
        case c_ne:
        case c_gt:
        case c_ge:
        case c_lt:
        case c_le:
            generate_integer_test(g, p);
            break;
        case c_call:          generate_call(g, p); break;
        case c_grouping:      generate_grouping(g, p, false); break;
        case c_non:           generate_grouping(g, p, true); break;
        case c_name:          generate_namedstring(g, p); break;
        case c_literalstring: generate_literalstring(g, p); break;
        case c_among:         generate_among(g, p); break;
        case c_substring:     generate_substring(g, p); break;
        case c_booltest:      generate_booltest(g, p, false); break;
        case c_not_booltest:  generate_booltest(g, p, true); break;
        case c_false:         generate_false(g, p); break;
        case c_true:          break;
        case c_debug:         generate_debug(g, p); break;
        case c_functionend:   generate_functionend(g, p); break;
        default:
            /* Unknown node type: report it and abort the compiler. */
            fprintf(stderr, "%d encountered\n", p->type);
            exit(1);
    }

    /* Restore the failure context saved on entry. */
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
}

/* Emit the opening of the generated PHP class (`~n` expands to the
 * configured name).
 */
static void generate_class_begin(struct generator * g) {
    w(g, "class Snowball~n extends SnowballStemmer~N");
    write_block_start(g);
}

/* Emit the end of the generated PHP class. */
static void generate_class_end(struct generator * g) {
    write_block_end(g);
}

/* Emit the PHP constant array A_<n> for among x, plus the AS_<n> array of
 * replacement strings when every command of x is a slicefrom.
 */
static void generate_among_table(struct generator * g, struct among * x) {
write_comment(g, x->node);

    struct amongvec * v = x->b;

    g->I[0] = x->number;
    w(g, "~Mprivate const array A_~I0 = [~N~+");

    /* One `[string, i, result(, 'function')]` entry per among string. */
    for (int i = 0; i < x->literalstring_count; i++) {
        if (i) w(g, ",~N");
        g->I[0] = v[i].i;
        g->I[1] = v[i].result;

        w(g, "~M[");
        write_literal_string(g, v[i].b);
        w(g, ", ~I0, ~I1");
        if (v[i].function != NULL) {
            w(g, ", '");
            write_varname(g, v[i].function); // callable
            w(g, "'");
        }
        w(g, "]");
    }
    w(g, "~N~-~M];~N~N");

    /* Same condition as the slice_from shortcut in generate_among(). */
    if (x->same_action == c_slicefrom && x->command_count > 1) {
        g->I[0] = x->number;
        w(g, "~Mprivate const array AS_~I0 = [");
        for (int i = 1; i <= x->command_count; i++) {
            if (i > 1) w(g, ", ");
            write_literal_string(g, x->commands[i - 1]->left->literalstring);
        }
        w(g, "];~N~N");
    }
}

/* Emit the table(s) for every among in the analyser's list. */
static void generate_amongs(struct generator * g) {
    for (struct among * x = g->analyser->amongs; x; x = x->next) {
        generate_among_table(g, x);
    }
}

/* Emit grouping q as a PHP constant array mapping each member character
 * to true.
 */
static void generate_grouping_table(struct generator * g, struct grouping * q) {
    symbol * b = q->b;

    w(g, "~Mprivate const array ");
    write_varname(g, q->name);
    write_string(g, " = [");
    for (int i = 0; i < SIZE(b); i++) {
        if (i) w(g, ", ");
        write_literal_char(g, b[i]);
        w(g, "=>true");
    }
    w(g, "];~N~N");
}

/* Emit the table for every grouping in the analyser's list. */
static void generate_groupings(struct generator * g) {
    for (struct grouping * q = g->analyser->groupings; q; q = q->next) {
        generate_grouping_table(g, q);
    }
}

/* Emit a typed, initialised private PHP property for each string, integer
 * and boolean name whose local_to is not set.  A blank line follows if
 * anything was written.
 */
static void generate_members(struct generator * g) {
    bool wrote_members = false;

    for (struct name * q = g->analyser->names; q; q = q->next) {
        /* Names with local_to set are not emitted as properties. */
        if (q->local_to) continue;
        switch (q->type) {
            case t_string:
                w(g, "~Mprivate string $");
                write_varname(g, q);
                w(g, " = '';~N");
                wrote_members = true;
                break;
            case t_integer:
                w(g, "~Mprivate int $");
                write_varname(g, q);
                w(g, " = 0;~N");
                wrote_members = true;
                break;
            case t_boolean:
                w(g, "~Mprivate bool $");
                write_varname(g, q);
                w(g, " = false;~N");
                wrote_members = true;
                break;
        }
    }
    if (wrote_members) w(g, "~N");
}

/* Generate every top-level node of the program, clearing the unreachable
 * flag after each one.
 */
static void generate_methods(struct generator * g) {
    for (struct node * p = g->analyser->program; p; p = p->right) {
generate(g, p); g->unreachable = false; } } extern void generate_program_php(struct generator * g) { g->varname_prefix = "$v_"; g->outbuf = str_new(); g->failure_str = str_new(); w(g, "options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.1.0/compiler/generator_python.c000066400000000000000000001243721520373054300207140ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h" /* prototype functions for recursive use: */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label_python(struct generator * g) { int next_label = new_label(g); if (next_label > g->max_label) g->max_label = next_label; return next_label; } #define new_label(G) new_label_python(G) /* Write routines for items from the syntax tree */ static void write_relop(struct generator * g, int relop) { // Relational operators are the same as C. write_c_relop(g, relop); } static void write_varname(struct generator * g, struct name * p) { switch (p->type) { case t_external: if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } break; case t_routine: write_string(g, "__"); /* FALLTHRU */ default: { // We use the same naming scheme for both global and local // variables. write_char(g, "SBIrxg"[p->type]); write_char(g, '_'); break; } } write_s(g, p->s); } /* Reference to variable, e.g. when assigning to or using in an expression. 
*/ static void write_varref(struct generator * g, struct name * p) { if (p->type >= t_routine || p->local_to == NULL) { write_string(g, "self."); } write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { write_char(g, '"'); for (int i = 0; i < SIZE(p); i++) { int ch = p[i]; if ((32 <= ch && ch < 127) || (0xa0 < ch && ch < 0x590)) { if (ch == '"' || ch == '\\') write_char(g, '\\'); // Our Python generator uses ENC_WIDECHARS so we need to convert. write_wchar_as_utf8(g, ch); } else { // Use escapes for anything over 0x590 as a crude way to avoid // LTR characters affecting the rendering of source character // order in confusing ways. write_string(g, "\\u"); write_hex4(g, ch); } } write_char(g, '"'); } static void write_literal_char(struct generator * g, symbol ch) { write_char(g, '"'); if (32 <= ch && ch < 0x590 && ch != 127) { if (ch == '"' || ch == '\\') write_char(g, '\\'); // Python uses ENC_WIDECHARS so we need to convert. write_wchar_as_utf8(g, ch); } else { // Use escapes for anything over 0x590 as a crude way to avoid // LTR characters affecting the rendering of source character // order in confusing ways. 
write_string(g, "\\u");
        write_hex4(g, ch);
    }
    write_char(g, '"');
}

/* Write a `# ...` comment line describing node p, if comments are enabled
 * in the options.
 */
static void write_comment(struct generator * g, struct node * p) {
    if (!g->options->comments) return;
    write_margin(g);
    write_string(g, "# ");
    write_comment_content(g, p, NULL);
    write_newline(g);
}

/* Start an indented block: increase the margin and end the current line. */
static void write_block_start(struct generator * g) {
    w(g, "~+~N");
}

/* End an indented block: decrease the margin. */
static void write_block_end(struct generator * g) {
    w(g, "~-");
}

/* Emit `<savevar> = self.cursor` (or `self.limit - self.cursor` when p is
 * not in forward mode).
 */
static void write_savecursor(struct generator * g, struct node * p,
                             struct str * savevar) {
    g->B[0] = str_data(savevar);
    g->S[1] = "";
    if (p->mode != m_forward) g->S[1] = "self.limit - ";
    writef(g, "~M~B0 = ~S1self.cursor~N", p);
}

/* Replace the contents of `out` with the Python statement which restores
 * the cursor from savevar (the inverse of write_savecursor()).
 */
static void restore_string(struct node * p, struct str * out, struct str * savevar) {
    str_clear(out);
    str_append_string(out, "self.cursor = ");
    if (p->mode != m_forward) str_append_string(out, "self.limit - ");
    str_append(out, savevar);
}

/* Emit the cursor-restoring statement built by restore_string() on a line
 * of its own.
 */
static void write_restorecursor(struct generator * g, struct node * p,
                                struct str * savevar) {
    struct str * temp = str_new();
    write_margin(g);
    restore_string(p, temp, savevar);
    write_str(g, temp);
    write_newline(g);
    str_delete(temp);
}

/* Emit a statement moving the cursor one position in the direction of
 * p->mode.
 */
static void write_inc_cursor(struct generator * g, struct node * p) {
    write_margin(g);
    write_string(g, p->mode == m_forward ? "self.cursor += 1" : "self.cursor -= 1");
    write_newline(g);
}

/* Open the `try:` block which stands in for a label in the Python output. */
static void wsetlab_begin(struct generator * g) {
    w(g, "~Mtry:~N~+");
}

/* Close the `try:` block opened by wsetlab_begin() with a handler for the
 * exception class `lab<n>`.
 */
static void wsetlab_end(struct generator * g, int n) {
    g->I[0] = n;
    w(g, "~-~Mexcept lab~I0: pass~N");
    // We can safely reuse this later in this function.
g->next_label = n; } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn False"); break; default: w(g, "~Mraise lab"); write_int(g, g->failure_label); w(g, "()"); } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, ":", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "self.cursor >= self.limit", p); } else { write_failure_if(g, "self.cursor <= self.limit_backward", p); } } /* Formatted write. */ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, 
p->name); continue;
            case 'L': write_literal_string(g, p->literalstring); continue;
            case '+': g->margin++; continue;
            case '-': g->margin--; continue;
            case 'n': write_s(g, g->options->name); continue;
            default:
                printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n",
                       ch, input);
                exit(1);
            /* Shared error exit for the two-character ~B / ~I escapes. */
            invalid_escape2:
                printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n",
                       ch, input[i - 1], input);
                exit(1);
        }
    }
}

/* writef() without a node: only for format strings which use no escape
 * that dereferences p (~V, ~W, ~L).
 */
static void w(struct generator * g, const char * s) {
    writef(g, s, NULL);
}

/* Write arithmetic expression p as a Python expression.  Binary operators
 * are always wrapped in parentheses.
 */
static void generate_AE(struct generator * g, struct node * p) {
    const char * s;
    switch (p->type) {
        case c_name:
            write_varref(g, p->name); break;
        case c_number:
            write_int(g, p->number); break;
        case c_maxint:
            write_string(g, "sys.maxsize");
            break;
        case c_minint:
            /* write_string() is used here, so `~` is emitted literally. */
            write_string(g, "(~sys.maxsize)");
            break;
        case c_neg:
            write_char(g, '-'); generate_AE(g, p->right); break;
        case c_multiply:
            s = " * "; goto label0;
        case c_divide:
            /* Snowball specifies integer division with semantics matching C,
             * so Python's `/` or `//` isn't suitable (`//` would be in cases
             * where we knew that the arguments had the same sign).
             */
            /* The "int" written here combines with the parentheses written
             * at label0 to give `int(a / b)`. */
            write_string(g, "int");
            s = " / ";
            goto label0;
        case c_plus:
            s = " + "; goto label0;
        case c_minus:
            s = " - ";
        label0:
            write_char(g, '(');
            generate_AE(g, p->left);
            write_string(g, s);
            generate_AE(g, p->right);
            write_char(g, ')');
            break;
        case c_cursor:
            w(g, "self.cursor"); break;
        case c_limit:
            w(g, p->mode == m_forward ? "self.limit" : "self.limit_backward"); break;
        case c_len: /* Same as size() for Python. */
        case c_size:
            w(g, "len(self.current)");
            break;
        case c_lenof: /* Same as sizeof() for Python.
 */
        case c_sizeof:
            writef(g, "len(~V)", p);
            break;
    }
}

/* Generate each command of a bracketed sequence in order. */
static void generate_bra(struct generator * g, struct node * p) {
    write_comment(g, p);
    p = p->left;
    while (p) {
        generate(g, p);
        p = p->right;
    }
}

/* Generate `and`: each operand in turn, restoring the saved cursor between
 * operands when K_needed_for_and() says a save is required.  Generation
 * stops at the first operand which leaves the code unreachable.
 */
static void generate_and(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_and(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    while (p) {
        generate(g, p);
        if (g->unreachable) break;
        /* No restore is needed after the last operand. */
        if (savevar && p->right != NULL) write_restorecursor(g, p, savevar);
        p = p->right;
    }

    if (savevar) {
        str_delete(savevar);
    }
}

/* Generate `or` as a `while True:` loop which is left via `break`.
 *
 * Every alternative except the last is wrapped in its own try/except label
 * so that its failure falls through to the next alternative; the last one
 * is generated with the caller's failure context.
 */
static void generate_or(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed_for_or(p->left)) {
        savevar = vars_newname(g);
    }

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    /* Stays true only while no alternative can complete normally. */
    bool end_unreachable = true;

    write_comment(g, p);
    w(g, "~Mwhile True:~N~+");

    if (savevar && K_needed_node_on_f(p)) {
        write_savecursor(g, p, savevar);
    }

    p = p->left;
    str_clear(g->failure_str);

    if (p == NULL) {
        /* p should never be NULL after an or: there should be at least two
         * sub nodes.
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g); generate(g, p); if (!g->unreachable) { w(g, "~Mbreak~N"); end_unreachable = false; } wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); w(g, "~Mbreak~N~-"); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mself.limit_backward = self.cursor~N" "~Mself.cursor = self.limit~N", p); generate(g, p->left); w(g, "~Mself.cursor = self.limit_backward~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } wsetlab_begin(g); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); restore_string(p, g->failure_str, savevar); } wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void generate_set(struct generator * g, 
struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = True~N", p);
}

/* Emit `<var> = False` for the boolean named by p. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = False~N", p);
}

/* Generate `fail`: the operand, then the failure code if the end of the
 * operand is reachable.
 */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    if (!g->unreachable) write_failure(g);
}

/* generate_test() also implements 'reverse' */
static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);

    if (savevar) {
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (savevar) {
        /* The restore is skipped when the code after the operand cannot be
         * reached. */
        if (!g->unreachable) {
            write_restorecursor(g, p, savevar);
        }
        str_delete(savevar);
    }
}

/* Generate `do`: run the operand ignoring its signal, then restore the
 * cursor if it was saved.
 */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = NULL;
    if (K_needed(p->left)) {
        savevar = vars_newname(g);
    }

    write_comment(g, p);
    if (savevar) write_savecursor(g, p, savevar);

    if (p->left->type == c_call) {
        /* Optimise `do` applied directly to a routine call: the result is
         * simply discarded, so no try/except label is needed. */
        write_comment(g, p->left);
        writef(g, "~M~V()~N", p->left);
    } else {
        int label = new_label(g);
        g->failure_label = label;
        str_clear(g->failure_str);

        wsetlab_begin(g);
        generate(g, p->left);
        wsetlab_end(g, label);
        g->unreachable = false;
    }

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }
}

/* Generate `next`: fail if the cursor is at the limit, otherwise move it by
 * one.
 */
static void generate_next(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_check_limit(g, p);
    write_inc_cursor(g, p);
}

/* Generate goto/gopast over a grouping via the runtime's
 * go_in_grouping / go_out_grouping helpers ("_b" variants when p->mode is
 * not m_forward).  For gopast (is_goto == 0) the cursor is then moved one
 * further position.
 */
static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
    write_comment(g, p);

    g->S[0] = p->mode == m_forward ? "" : "_b";
    /* Note the sense: complement selects "in", not "out". */
    g->S[1] = complement ? "in" : "out";
    write_failure_if(g, "not self.go_~S1_grouping~S0(~n.~W)", p);
    if (!is_goto) {
        if (p->mode == m_forward)
            w(g, "~Mself.cursor += 1~N");
        else
            w(g, "~Mself.cursor -= 1~N");
    }
}

/* Generate the general form of goto (is_goto != 0) and gopast: a
 * `while True:` loop which tries the operand inside a try/except label,
 * breaks out on success, and otherwise restores the cursor (if saved),
 * fails at the limit and advances by one.
 */
static void generate_GO(struct generator * g, struct node * p, int is_goto) {
    write_comment(g, p);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    bool end_unreachable = false;

    w(g, "~Mwhile True:~N~+");

    struct str * savevar = NULL;
    if (is_goto || repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    str_clear(g->failure_str);
    wsetlab_begin(g);
    generate(g, p->left);

    if (g->unreachable) {
        /* Cannot break out of this loop: therefore the code after the
         * end of the loop is unreachable.*/
        end_unreachable = true;
    } else {
        /* include for goto; omit for gopast */
        if (is_goto) write_restorecursor(g, p, savevar);
        w(g, "~Mbreak~N");
    }
    g->unreachable = false;
    wsetlab_end(g, label);
    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    /* The limit check below fails to the caller's failure context. */
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    w(g, "~-");
    g->unreachable = end_unreachable;
}

/* Generate `loop`: a Python `for _ in ...:` loop around the operand. */
static void generate_loop(struct generator * g, struct node * p) {
    write_comment(g, p);
    if (p->AE->type == c_number && p->AE->number <= 4) {
        // Use a tuple instead of range() for small constant numbers of
        // iterations.
w(g, "~Mfor _ in ");
        /* NOTE(review): for a count of 1 this writes just `0` (an int, not a
         * tuple) and for a count <= 0 it writes nothing at all.  Presumably
         * such constant counts never reach this point; confirm. */
        for (int i = p->AE->number; i > 0; --i) {
            w(g, "0");
            if (i > 1) w(g, ", ");
        }
        writef(g, ":", p);
    } else {
        w(g, "~Mfor _ in range(");
        generate_AE(g, p->AE);
        writef(g, "):", p);
    }
    writef(g, "~{", p);

    generate(g, p->left);

    w(g, "~}");
    g->unreachable = false;
}

/* Shared body of `repeat` and `atleast`: a `while True:` loop which runs
 * the operand inside a try/except label, `continue`s while it succeeds and
 * `break`s (after restoring the cursor if it was saved) once it fails.
 * When loopvar is non-NULL it is decremented after each successful
 * iteration.
 */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    writef(g, "~Mwhile True:~N~+", p);

    struct str * savevar = NULL;
    if (repeat_restore(p->left)) {
        savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
    }

    int label = new_label(g);
    g->failure_label = label;
    str_clear(g->failure_str);
    wsetlab_begin(g);
    generate(g, p->left);

    if (!g->unreachable) {
        if (loopvar != NULL) {
            g->B[0] = str_data(loopvar);
            w(g, "~M~B0 -= 1~N");
        }

        w(g, "~Mcontinue~N");
    }

    wsetlab_end(g, label);
    g->unreachable = false;

    if (savevar) {
        write_restorecursor(g, p, savevar);
        str_delete(savevar);
    }

    w(g, "~Mbreak~N~}");
}

/* Generate `repeat`. */
static void generate_repeat(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate_repeat_or_atleast(g, p, NULL);
}

/* Generate `atleast`: a counter initialised from p->AE is decremented per
 * successful iteration, and the command fails if the counter is still
 * positive after the loop.
 */
static void generate_atleast(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct str * loopvar = vars_newname(g);
    g->B[0] = str_data(loopvar);
    w(g, "~M~B0 = ");
    generate_AE(g, p->AE);
    w(g, "~N");
    {
        /* Keep the caller's failure context for the check after the loop. */
        int a0 = g->failure_label;
        struct str * a1 = str_copy(g->failure_str);

        generate_repeat_or_atleast(g, p, loopvar);

        g->failure_label = a0;
        str_delete(g->failure_str);
        g->failure_str = a1;
    }
    g->B[0] = str_data(loopvar);
    write_failure_if(g, "~B0 > 0", p);
    str_delete(loopvar);
}

/* Generate `tomark`: fail if the cursor is already beyond the mark (in the
 * direction of p->mode), otherwise set the cursor to the mark.
 */
static void generate_tomark(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? ">" : "<";

    w(g, "~Mif self.cursor ~S0 ");
    generate_AE(g, p->AE);
    w(g, ":");
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
    g->unreachable = false;
    w(g, "~Mself.cursor = ");
    generate_AE(g, p->AE);
    writef(g, "~N", p);
}

/* Generate `hop`: move the cursor by p->AE positions in the direction of
 * p->mode, failing if that would pass the limit.  For a non-constant
 * distance the new position is also rejected if it lies behind the current
 * cursor.
 */
static void generate_hop(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "+" : "-";
    g->S[1] = p->mode == m_forward ? "> self.limit" : "< self.limit_backward";
    g->S[2] = p->mode == m_forward ? "<" : ">";
    if (p->AE->type == c_number) {
        // Constant distance hop.
        //
        // No need to check for negative hop as that's converted to false by
        // the analyser.
        g->I[0] = p->AE->number;
        write_failure_if(g, "self.cursor ~S0 ~I0 ~S1", p);
        w(g, "~Mself.cursor ~S0= ~I0~N");
    } else {
        w(g, "~Mc = self.cursor ~S0 ");
        generate_AE(g, p->AE);
        w(g, "~N");
        write_failure_if(g, "c ~S1 or c ~S2 self.cursor", p);
        writef(g, "~Mself.cursor = c~N", p);
    }
}

/* Generate `delete` as a call to the runtime's slice_del(). */
static void generate_delete(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~Mself.slice_del()~N", p);
}

/* Generate `tolimit`: set the cursor to the limit for p->mode. */
static void generate_tolimit(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "self.limit" : "self.limit_backward";
    writef(g, "~Mself.cursor = ~S0~N", p);
}

/* Generate `[`: record the cursor in self.bra (self.ket when p->mode is
 * not m_forward).
 */
static void generate_leftslice(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "self.bra" : "self.ket";
    writef(g, "~M~S0 = self.cursor~N", p);
}

/* Generate `]`: record the cursor in self.ket (self.bra when p->mode is
 * not m_forward).
 */
static void generate_rightslice(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "self.ket" : "self.bra";
    writef(g, "~M~S0 = self.cursor~N", p);
}

/* Generate `=>`: assign the result of the runtime's assign_to() to the
 * variable named by p.
 */
static void generate_assignto(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = self.assign_to()~N", p);
}

/* Generate `->`: assign the result of the runtime's slice_to() to the
 * variable named by p.
 */
static void generate_sliceto(struct generator * g, struct node * p) {
    write_comment(g, p);
    writef(g, "~M~V = self.slice_to()~N", p);
}

/* Write the string operand of p: its literal string if it has one,
 * otherwise a reference to its named variable.
 */
static void generate_address(struct generator * g, struct node * p) {
    symbol * b = p->literalstring;
    if (b != NULL) {
        write_literal_string(g, b);
    } else {
        write_varref(g, p->name);
    }
}

/* Generate `insert` / `attach` (style is the node type) as a call to the
 * runtime's insert() at the cursor.
 */
static void generate_insert(struct generator * g, struct node * p, int style) {
    write_comment(g, p);

    /* The cursor is saved and put back for attach in forward mode and for
     * insert in backward mode. */
    int keep_c = style == c_attach;
    if (p->mode == m_backward) keep_c = !keep_c;
    if (keep_c) w(g, "~Mc = self.cursor~N");
    writef(g, "~Mself.insert(self.cursor, self.cursor, ", p);
    generate_address(g, p);
    writef(g, ")~N", p);
    if (keep_c) w(g, "~Mself.cursor = c~N");
}

/* Generate `<-` string assignment: replace everything between the cursor
 * and the limit for p->mode with the operand.
 */
static void generate_stringassign(struct generator * g, struct node * p) {
    write_comment(g, p);

    int keep_c = p->mode == m_forward; /* like 'attach' */
    if (keep_c) {
        writef(g, "~Mc = self.cursor~N", p);
    }
    if (p->mode == m_forward) {
        writef(g, "~Mself.insert(self.cursor, self.limit, ", p);
    } else {
        writef(g, "~Mself.insert(self.limit_backward, self.cursor, ", p);
    }
    generate_address(g, p);
    writef(g, ")~N", p);
    if (keep_c) {
        w(g, "~Mself.cursor = c~N");
    }
}

/* Generate `<-` on the slice as a call to the runtime's slice_from(). */
static void generate_slicefrom(struct generator * g, struct node * p) {
    write_comment(g, p);
    w(g, "~Mself.slice_from(");
    generate_address(g, p);
    writef(g, ")~N", p);
}

/* Generate `setlimit C1 for C2`: narrow the limit to where C1 leaves the
 * cursor, run C2 (p->aux), then widen the limit again.  The widening
 * statement is also installed as the failure string so it runs when C2
 * fails.
 */
static void generate_setlimit(struct generator * g, struct node * p) {
    write_comment(g, p);
    struct str * savevar = vars_newname(g);
    struct str * varname = vars_newname(g);
    if (p->left && p->left->type == c_tomark) {
        /* Special case for:
         *
         *   setlimit tomark AE for C
         *
         * All uses of setlimit in the current stemmers we ship follow this
         * pattern, and by special-casing we can avoid having to save and
         * restore c.
 */
        struct node * q = p->left;
        write_comment(g, q);
        g->S[0] = q->mode == m_forward ? ">" : "<";
        w(g, "~Mif self.cursor ~S0 ");
        generate_AE(g, q->AE);
        w(g, ":");
        write_block_start(g);
        write_failure(g);
        write_block_end(g);
        g->unreachable = false;

        g->B[0] = str_data(varname);
        if (p->mode == m_forward) {
            /* Forward mode stores the amount the limit was reduced by. */
            w(g, "~M~B0 = self.limit~N");
            w(g, "~Mself.limit = ");
            generate_AE(g, q->AE);
            w(g, "~N");
            w(g, "~M~B0 -= self.limit~N");
        } else {
            /* Backward mode stores the old limit_backward itself. */
            w(g, "~M~B0 = self.limit_backward~N");
            w(g, "~Mself.limit_backward = ");
            generate_AE(g, q->AE);
            w(g, "~N");
        }

        if (p->mode == m_forward) {
            str_assign(g->failure_str, "self.limit += ");
            str_append(g->failure_str, varname);
        } else {
            str_assign(g->failure_str, "self.limit_backward = ");
            str_append(g->failure_str, varname);
        }
    } else {
        /* General case: run C1 with the cursor saved, take the new limit
         * from where it leaves the cursor, then restore the cursor. */
        write_savecursor(g, p, savevar);
        generate(g, p->left);

        if (!g->unreachable) {
            g->B[0] = str_data(varname);
            if (p->mode == m_forward) {
                w(g, "~M~B0 = self.limit - self.cursor~N");
                w(g, "~Mself.limit = self.cursor~N");
            } else {
                w(g, "~M~B0 = self.limit_backward~N");
                w(g, "~Mself.limit_backward = self.cursor~N");
            }
            write_restorecursor(g, p, savevar);

            if (p->mode == m_forward) {
                str_assign(g->failure_str, "self.limit += ");
                str_append(g->failure_str, varname);
            } else {
                str_assign(g->failure_str, "self.limit_backward = ");
                str_append(g->failure_str, varname);
            }
        }
    }

    if (!g->unreachable) {
        generate(g, p->aux);

        if (!g->unreachable) {
            /* Success path: emit the same limit-restoring statement which
             * the failure path uses. */
            write_margin(g);
            write_str(g, g->failure_str);
            write_newline(g);
        }
    }
    str_delete(varname);
    str_delete(savevar);
}

/* dollar sets snowball up to operate on a string variable as if it were the
 * current string */
static void generate_dollar(struct generator * g, struct node * p) {
    write_comment(g, p);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int label = new_label(g);
    g->failure_label = label;
    str_clear(g->failure_str);

    /* savevar names the generated BaseStemmer copy of the current state;
     * `<savevar>_f` is the generated flag recording whether the operand
     * failed. */
    struct str * savevar = vars_newname(g);
    g->B[0] = str_data(savevar);
    writef(g, "~M~B0 = BaseStemmer()~N"
              "~M~B0.copy_from(self)~N", p);

    /* NOTE(review): unlike the PHP generator's version of this routine,
     * limit_backward is not reset here - confirm that is intended. */
    writef(g, "~Mself.current = ~V~N"
              "~Mself.cursor = 0~N"
              "~Mself.limit = len(self.current)~N", p);

    if (p->left->possible_signals == -1) {
        /* Assume failure. */
        w(g, "~M~B0_f = True~N");
    }

    wsetlab_begin(g);

    generate(g, p->left);

    if (!g->unreachable && p->left->possible_signals == -1) {
        /* Mark success. */
        g->B[0] = str_data(savevar);
        w(g, "~M~B0_f = False~N");
    }

    wsetlab_end(g, g->failure_label);

    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    /* Copy the result back into the variable and restore the saved state. */
    g->B[0] = str_data(savevar);
    writef(g, "~M~V = self.current~N"
              "~Msuper().copy_from(~B0)~N", p);

    if (p->left->possible_signals == 0) {
        // p->left always signals f.
        write_failure(g);
    } else if (p->left->possible_signals == -1) {
        write_failure_if(g, "~B0_f", p);
    }

    str_delete(savevar);
}

/* Emit the Python integer assignment `<var> <s> <AE>`; s is the operator
 * text.
 */
static void generate_integer_assign(struct generator * g, struct node * p, const char * s) {
    write_comment(g, p);
    g->S[0] = s;
    writef(g, "~M~V ~S0 ", p);
    generate_AE(g, p->AE);
    w(g, "~N");
}

/* Emit a Python integer comparison between p->left and p->AE.
 *
 * If tailcallable() reports true the comparison is emitted as a `return`
 * expression and p->right is cleared; otherwise the operator is inverted
 * (relop ^ 1) and used to guard the failure code.
 */
static void generate_integer_test(struct generator * g, struct node * p) {
    write_comment(g, p);
    int relop = p->type;
    int optimise_to_return = tailcallable(g, p);
    if (optimise_to_return) {
        w(g, "~Mreturn ");
        p->right = NULL;
    } else {
        w(g, "~Mif ");
        // We want the inverse of the snowball test here.
        relop ^= 1;
    }
    generate_AE(g, p->left);
    write_relop(g, relop);
    generate_AE(g, p->AE);
    if (optimise_to_return) {
        w(g, "~N");
    } else {
        w(g, ":");
        write_block_start(g);
        write_failure(g);
        write_block_end(g);
        g->unreachable = false;
    }
}

/* Emit a Python call to the routine named by p.
 *
 * `signals` is the callee's possible_signals: 1 and 0 are handled as
 * "always succeeds" and "always fails" respectively, anything else gets a
 * runtime check of the returned value.
 */
static void generate_call(struct generator * g, struct node * p) {
    int signals = p->name->definition->possible_signals;
    write_comment(g, p);
    if (tailcallable(g, p)) {
        /* Tail call. */
        writef(g, "~Mreturn ~V()~N", p);
        p->right = NULL;
        g->unreachable = true;
        return;
    }
    if (just_return_on_fail(g) && signals == 0) {
        /* Always fails. */
        writef(g, "~Mreturn ~V()~N", p);
        g->unreachable = true;
        return;
    }
    if (signals == 1) {
        /* Always succeeds. */
        writef(g, "~M~V()~N", p);
    } else if (signals == 0) {
        /* Always fails.
*/ writef(g, "~M~V()~N", p); write_failure(g); } else { write_failure_if(g, "not ~V()", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; if (tailcallable(g, p)) { writef(g, "~Mreturn self.~S1_grouping~S0(~n.~W)~N", p); p->right = NULL; } else { write_failure_if(g, "not self.~S1_grouping~S0(~n.~W)", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn self.eq_s~S0(~V)~N", p); p->right = NULL; } else { write_failure_if(g, "not self.eq_s~S0(~V)", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); symbol * b = p->literalstring; if (SIZE(b) == 1) { /* It's quite common to compare with a single character literal string, * so just inline the simpler code for this case rather than making a * function call. */ if (p->mode == m_forward) { writef(g, "~Mif self.cursor == self.limit or self.current[self.cursor] != ~L:~f" "~Mself.cursor += 1~N", p); } else { writef(g, "~Mif self.cursor <= self.limit_backward or self.current[self.cursor - 1] != ~L:~f" "~Mself.cursor -= 1~N", p); } return; } g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn self.eq_s~S0(~L)~N", p); p->right = NULL; } else { write_failure_if(g, "not self.eq_s~S0(~L)", p); } } static void generate_define(struct generator * g, struct node * p) { write_newline(g); write_comment(g, p); writef(g, "~Mdef ~W(self):~+~N", p); g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; /* Generate function body. 
*/ generate(g, p->left); if (p->left->right) { assert(p->left->right->type == c_functionend); if (p->left->possible_signals) { generate(g, p->left->right); } } w(g, "~-"); } static void generate_functionend(struct generator * g, struct node * p) { (void)p; w(g, "~Mreturn True~N"); } static void generate_substring(struct generator * g, struct node * p) { write_comment(g, p); struct among * x = p->among; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = self.find_among~S0(~n.a_~I0)~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~Mself.find_among~S0(~n.a_~I0)~N", p); } else if (x->command_count == 0 && tailcallable(g, p)) { writef(g, "~Mreturn self.find_among~S0(~n.a_~I0) != 0~N", p); x->node->right = NULL; g->unreachable = true; } else { write_failure_if(g, "self.find_among~S0(~n.a_~I0) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { if (x->same_action == c_slicefrom && x->command_count > 1) { if (x->nocommand_count > 0) { w(g, "~Mif among_var > 0:~N~+"); } write_comment(g, x->commands[0]); g->I[0] = x->number; w(g, "~Mself.slice_from(~n.as_~I0[among_var - 1])~N"); if (x->nocommand_count > 0) { w(g, "~-"); } g->unreachable = false; return; } /* We dispatch the integer result in `among_var` with an if-chain, * which is O(n) unless Python has a special optimisation (and * profiling with the `timeit` module suggests it doesn't). 
There * doesn't appear to be a good alternative in Python (3.10 added * `match` but that seems to be aimed more at pattern matching rather * than O(1) dispatch of an integer and it was actually slower when we * tried generating it here). */ for (int i = 1; i <= x->command_count; i++) { if (i == x->command_count && x->nocommand_count == 0) { w(g, "~Melse:~N~+"); } else { g->I[0] = i; w(g, (i > 1 ? "~Melif" : "~Mif")); w(g, " among_var == ~I0:~N~+"); } generate(g, x->commands[i - 1]); w(g, "~-"); g->unreachable = false; } } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. if (inverted) { writef(g, "~Mreturn not ~V~N", p); } else { writef(g, "~Mreturn ~V~N", p); } p->right = NULL; g->unreachable = true; return; } if (inverted) { write_failure_if(g, "~V", p); } else { write_failure_if(g, "not ~V", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mself.debug(~I0, ~I1)~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; 
case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: /* Snowball specifies integer division with semantics matching C, * so Python's `/=` or `//=` isn't suitable (`//=` would be in * cases where we knew that the arguments had the same sign). 
*/ writef(g, "~M~V = int(~V / ", p); generate_AE(g, p->AE); w(g, ")~N"); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "from .basestemmer import "); write_string(g, g->options->parent_class_name); w(g, "~N" "from .among import Among~N" "~N" "~N" "class ~n("); write_string(g, g->options->parent_class_name); w(g, "):~N" "~+~M'''~N" "~MThis class implements the stemming algorithm defined by a snowball script.~N" "~M"); write_generated_comment_content(g); w(g, "~N" "~M'''~N" "~N"); } static void generate_among_table(struct generator * g, struct among * x) { write_newline(g); write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Ma_~I0 = [~N~+"); for (int i = 0; i < x->literalstring_count; i++) { if (i) w(g, ",~N"); g->I[0] = v[i].i; g->I[1] = v[i].result; w(g, "~MAmong("); write_literal_string(g, v[i].b); w(g, ", ~I0, ~I1"); if (v[i].function != NULL) { w(g, ", "); write_varname(g, v[i].function); } w(g, ")"); } w(g, "~N~-~M]~N"); if (x->same_action == c_slicefrom && x->command_count > 1) { g->I[0] = x->number; w(g, "~Mas_~I0 = ("); for (int i = 1; i <= 
x->command_count; i++) { if (i > 1) w(g, ", "); write_literal_string(g, x->commands[i - 1]->left->literalstring); } w(g, ")~N"); } } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void generate_grouping_table(struct generator * g, struct grouping * q) { symbol * b = q->b; write_margin(g); write_varname(g, q->name); if (SIZE(b) <= 3) { // The `s in g` test used in base-stemmer.py works whether g is a // set or a string. // // The generated code for the string is smaller, faster to construct, // and (for a small grouping) seems to be if anything fractionally // faster to test. // // FIXME: The threshold seems about right for Python 3.13 but we should // recheck periodically. write_string(g, " = "); write_literal_string(g, b); w(g, "~N"); return; } // We could use frozenset, but it seems slightly slower to construct which // adds to startup time. write_string(g, " = {"); for (int i = 0; i < SIZE(b); i++) { if (i) w(g, ", "); write_literal_char(g, b[i]); } w(g, "}~N~N"); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: write_string(g, " "); write_varname(g, q); w(g, " = \"\"~N"); break; case t_integer: write_string(g, " "); write_varname(g, q); w(g, " = 0~N"); break; case t_boolean: write_string(g, " "); write_varname(g, q); w(g, " = False~N"); break; } } } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } static void generate_label_classes(struct generator * g) { for (int i = 0; i <= g->max_label; i++) { g->I[0] = i; w(g, "~N~Nclass lab~I0(BaseException): pass~N"); } } 
extern void generate_program_python(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); g->max_label = -1; write_start_comment(g, "# ", NULL); if (g->analyser->int_limits_used) { /* sys.maxsize is used in the code generated for maxint and minint */ w(g, "import sys~N~N"); } generate_class_begin(g); generate_groupings(g); generate_members(g); generate_methods(g); generate_amongs(g); generate_label_classes(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.1.0/compiler/generator_rust.c000066400000000000000000001352511520373054300203660ustar00rootroot00000000000000#include #include /* for fprintf etc */ #include /* for exit */ #include /* for strlen */ #include "header.h" /* prototype functions for recursive use: */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); /* Write routines for items from the syntax tree */ static void write_relop(struct generator * g, int relop) { // Relational operators are the same as C. write_c_relop(g, relop); } static void write_varname(struct generator * g, struct name * p) { // We use the same naming scheme for both global and local variables. write_char(g, "SbirrG"[p->type]); write_char(g, '_'); write_s(g, p->s); } /* Reference to variable, e.g. when assigning to or using in an expression. */ static void write_varref(struct generator * g, struct name * p) { if (p->local_to == NULL) { write_string(g, "context."); } write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { int i = 0; write_char(g, '"'); while (i < SIZE(p)) { int ch; i += get_utf8(p + i, &ch); // Write out ASCII and lower Unicode printables as literal characters. // Use escapes for anything over 0x590 as a crude way to avoid LTR // characters affecting the rendering of source character order in // confusing ways. 
if ((32 <= ch && ch < 127) || (0xa0 < ch && ch < 0x590)) {
            /* Emitted as a literal character; `"` and `\` get a backslash
             * escape first. */
            if (ch == '"' || ch == '\\') write_char(g, '\\');
            write_wchar_as_utf8(g, ch);
        } else {
            /* Everything else is emitted as a `\u{...}` escape. */
            write_string(g, "\\u{");
            write_hex4(g, ch);
            write_string(g, "}");
        }
    }
    write_char(g, '"');
}

/* Emit a `// ...` comment line describing node p, but only when comment
 * generation is enabled in the options. */
static void write_comment(struct generator * g, struct node * p) {
    if (!g->options->comments) return;
    write_margin(g);
    write_string(g, "// ");
    write_comment_content(g, p, NULL);
    write_newline(g);
}

/* Open a block: bump the margin, then write `{` and a newline. */
static void write_block_start(struct generator * g) {
    w(g, "~+{~N");
}

/* Close a block: drop the margin, then write an indented `}` line. */
static void write_block_end(struct generator * g) {
    w(g, "~-~M}~N");
}

/* Emit a `let` statement saving the cursor into the variable named by
 * savevar.  When p is not in forward mode the value saved is
 * `env.limit - env.cursor` rather than `env.cursor`. */
static void write_savecursor(struct generator * g, struct node * p,
                             struct str * savevar) {
    g->B[0] = str_data(savevar);
    g->S[1] = "";
    if (p->mode != m_forward) g->S[1] = "env.limit - ";
    writef(g, "~Mlet ~B0 = ~S1env.cursor;~N", p);
}

/* Append to `out` the statement which restores the cursor from savevar
 * (the inverse of what write_savecursor() emits).  Nothing is written for
 * margin or newline, so the result can also be stored in a failure string. */
static void append_restore_string(struct node * p, struct str * out,
                                  struct str * savevar) {
    str_append_string(out, "env.cursor = ");
    if (p->mode != m_forward) str_append_string(out, "env.limit - ");
    str_append(out, savevar);
    str_append_ch(out, ';');
}

/* Emit the cursor-restoring statement as a complete line of output. */
static void write_restorecursor(struct generator * g, struct node * p,
                                struct str * savevar) {
    write_margin(g);
    append_restore_string(p, g->outbuf, savevar);
    write_newline(g);
}

/* Emit a line moving the cursor by one character in the direction given by
 * p->mode. */
static void write_inc_cursor(struct generator * g, struct node * p) {
    write_margin(g);
    write_string(g, p->mode == m_forward ?
                    "env.next_char();" : "env.previous_char();");
    write_newline(g);
}

/* Open the labelled loop `'lab<n>: loop {` and indent. */
static void wsetlab_begin(struct generator * g, int n) {
    g->I[0] = n;
    w(g, "~M'lab~I0: loop {~N~+");
}

/* Close the labelled loop opened by wsetlab_begin().  If the end of the
 * loop body is reachable a `break 'lab<n>;` is emitted first. */
static void wsetlab_end(struct generator * g, int n) {
    if (!g->unreachable) {
        g->I[0] = n;
        w(g, "~Mbreak 'lab~I0;~N");
    }
    w(g, "~-~M}~N");
}

/* Emit `break 'lab<n>;` and mark the code which follows as unreachable. */
static void wgotol(struct generator * g, int n) {
    g->I[0] = n;
    w(g, "~Mbreak 'lab~I0;~N");
    g->unreachable = true;
}

/* Emit the code for signalling failure: first the pending failure string
 * (if non-empty) on its own line, then either `return false;` when the
 * failure label is x_return or `break 'lab<N>;` otherwise.  The code which
 * follows is marked unreachable. */
static void write_failure(struct generator * g) {
    if (str_len(g->failure_str) != 0) {
        write_margin(g);
        write_str(g, g->failure_str);
        write_newline(g);
    }
    switch (g->failure_label) {
        case x_return:
            w(g, "~Mreturn false;");
            break;
        default:
            w(g, "~Mbreak 'lab");
            write_int(g, g->failure_label);
            write_char(g, ';');
    }
    write_newline(g);
    g->unreachable = true;
}

/* Emit `if <s> { <failure code> }`.  The condition s is itself expanded by
 * writef() with node p, so it may contain `~` escapes.  Since the failure is
 * conditional, the code after the block is reachable again. */
static void write_failure_if(struct generator * g, const char * s,
                             struct node * p) {
    writef(g, "~Mif ", p);
    writef(g, s, p);
    writef(g, " ", p);
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
    g->unreachable = false;
}

/* if at limit fail */
static void write_check_limit(struct generator * g, struct node * p) {
    if (p->mode == m_forward) {
        write_failure_if(g, "env.cursor >= env.limit", p);
    } else {
        write_failure_if(g, "env.cursor <= env.limit_backward", p);
    }
}

/* Formatted write.
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'E': { // Write an external name. 
if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } write_s(g, p->name->s); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "i32::MAX"); break; case c_minint: write_string(g, "i32::MIN"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_divide: s = " / "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "env.cursor"); break; case c_limit: w(g, p->mode == m_forward ? 
"env.limit" : "env.limit_backward"); break; case c_len: w(g, "(env.current.chars().count() as i32)"); break; case c_size: w(g, "(env.current.len() as i32)"); break; case c_lenof: writef(g, "(~V.chars().count() as i32)", p); break; case c_sizeof: writef(g, "(~V.len() as i32)", p); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); bool end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar && K_needed_node_on_f(p)) { write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Menv.limit_backward = env.cursor;~N" "~Menv.cursor = env.limit;~N", p); generate(g, p->left); w(g, "~Menv.cursor = env.limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void 
generate_set(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); writef(g, "~M~W(env, context);~N", p->left); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.go_~S1_grouping~S0(~W, ~I0, ~I1)", p); if (!is_goto) { write_margin(g); write_string(g, p->mode == m_forward ? 
"env.next_char();" : "env.previous_char();"); write_newline(g); } } static void generate_GO(struct generator * g, struct node * p, int is_goto) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); bool end_unreachable = false; int golab = new_label(g); g->I[0] = golab; w(g, "~M'golab~I0: loop {~N~+"); struct str * savevar = NULL; if (is_goto || repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (is_goto) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak 'golab~I0;~N"); g->unreachable = true; } wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); w(g, "~Mfor _ in 0.."); generate_AE(g, p->AE); writef(g, " {~+~N", p); generate(g, p->left); w(g, "~-~M}~N"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { int replab = new_label(g); g->I[0] = replab; writef(g, "~M'replab~I0: loop{~N~+", p); struct str * savevar = NULL; if (repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); g->I[0] = g->failure_label; w(g, "~M'lab~I0: for _ in 0..1 {~N~+"); generate(g, 
p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0 -= 1;~N"); } g->I[0] = replab; w(g, "~Mcontinue 'replab~I0;~N"); } w(g, "~-~M}~N"); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->I[0] = replab; w(g, "~Mbreak 'replab~I0;~N~-~M}~N"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mlet mut ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif env.cursor ~S0 "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Menv.cursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); // Generate the AE to a temporary block so we can substitute it in // write_failure_if(). struct str * ae = str_new(); struct str * s = g->outbuf; g->outbuf = ae; generate_AE(g, p->AE); g->outbuf = s; g->B[0] = str_data(ae); g->S[0] = p->mode == m_forward ? "" : "_back"; g->S[1] = p->AE->type == c_number ? 
"" : "_checked"; write_failure_if(g, "!env.hop~S0~S1(~B0)", p); str_delete(ae); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Menv.slice_del();~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.limit" : "env.limit_backward"; writef(g, "~Menv.cursor = ~S0;~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.bra" : "env.ket"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.ket" : "env.bra"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = env.assign_to();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = env.slice_to();~N", p); } static void generate_address(struct generator * g, struct node * p) { /* If we deal with a string variable which is of type String we need to * pass it by reference not by value. Literalstrings on the other hand are * of type &'static str so we can pass them by value. 
*/ symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_char(g, '&'); write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~Mlet c = env.cursor;~N"); w(g, "~Mlet (bra, ket) = (env.cursor, env.cursor);~N"); writef(g, "~Menv.insert(bra, ket, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Menv.cursor = c;~N"); } static void generate_stringassign(struct generator * g, struct node * p) { write_comment(g, p); int keep_c = p->mode == m_forward; /* like 'attach' */ if (keep_c) { writef(g, "~Mlet c = env.cursor;~N", p); } /* Copying limits and cursors is necessary here because the rust * borrowchecker does not like taking something from someone you are about * to mutate... */ if (p->mode == m_forward) { writef(g, "~Mlet (bra, ket) = (env.cursor, env.limit);~N", p); } else { writef(g, "~Mlet (bra, ket) = (env.limit_backward, env.cursor);~N", p); } writef(g, "~Menv.insert(bra, ket, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) { w(g, "~Menv.cursor = c;~N"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Menv.slice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif env.cursor ~S0 "); generate_AE(g, q->AE); w(g, " "); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mlet mut ~B0 = env.limit;~N"); w(g, "~Menv.limit = "); generate_AE(g, q->AE); w(g, ";~N"); w(g, "~M~B0 -= env.limit;~N"); } else { w(g, "~Mlet ~B0 = env.limit_backward;~N"); w(g, "~Menv.limit_backward = "); generate_AE(g, q->AE); w(g, ";~N"); } if (p->mode == m_forward) { str_assign(g->failure_str, "env.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "env.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { struct str * savevar = vars_newname(g); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mlet ~B0 = env.limit - env.cursor;~N"); w(g, "~Menv.limit = env.cursor;~N"); } else { w(g, "~Mlet ~B0 = env.limit_backward;~N"); w(g, "~Menv.limit_backward = env.cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "env.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "env.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } str_delete(savevar); } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); struct str * savevar = vars_newname(g); 
g->B[0] = str_data(savevar); writef(g, "~Mlet ~B0 = env.clone();~N" "~Menv.set_current_s(~V.clone());~N" "~Menv.cursor = 0;~N" "~Menv.limit = env.current.len() as i32;~N", p); if (p->left->possible_signals == -1) { /* Assume failure. */ w(g, "~Mlet mut ~B0_f = true;~N"); } wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable && p->left->possible_signals == -1) { /* Mark success. */ g->B[0] = str_data(savevar); w(g, "~M~B0_f = false;~N"); } wsetlab_end(g, g->failure_label); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; g->B[0] = str_data(savevar); /* Update string variable; restore env. */ writef(g, "~M~V = env.current.clone().into_owned();~N" "~M*env = ~B0;~N", p); if (p->left->possible_signals == 0) { // p->left always signals f. write_failure(g); } else if (p->left->possible_signals == -1) { write_failure_if(g, "~B0_f", p); } str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, const char * s) { write_comment(g, p); g->S[0] = s; writef(g, "~M~V ~S0 ", p); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p) { write_comment(g, p); int relop = p->type; int optimise_to_return = tailcallable(g, p); if (optimise_to_return) { w(g, "~Mreturn "); p->right = NULL; } else { w(g, "~Mif "); // We want the inverse of the snowball test here. relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, "~N"); } else { write_char(g, ' '); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~W(env, context);~N", p); p->right = NULL; g->unreachable = true; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. 
*/ writef(g, "~Mreturn ~W(env, context);~N", p); g->unreachable = true; return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M~W(env, context);~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M~W(env, context);~N", p); write_failure(g); } else { write_failure_if(g, "!~W(env, context)", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn env.~S1_grouping~S0(~W, ~I0, ~I1);~N", p); p->right = NULL; } else { write_failure_if(g, "!env.~S1_grouping~S0(~W, ~I0, ~I1)", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn env.eq_s~S0(&~V);~N", p); p->right = NULL; } else { write_failure_if(g, "!env.eq_s~S0(&~V)", p); } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; if (tailcallable(g, p)) { writef(g, "~Mreturn env.eq_s~S0(&~L);~N", p); p->right = NULL; } else { write_failure_if(g, "!env.eq_s~S0(&~L)", p); } } static void generate_setup_context(struct generator * g) { w(g, "~Mlet mut context = &mut Context {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: write_margin(g); write_varname(g, q); w(g, ": String::new(),~N"); break; case t_integer: write_margin(g); write_varname(g, q); w(g, ": 0,~N"); break; case t_boolean: write_margin(g); write_varname(g, q); w(g, ": false,~N"); break; } } w(g, "~-~M};~N"); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; write_newline(g); write_comment(g, p); if (q->type == t_routine) { writef(g, "~Mfn ~W(env: &mut SnowballEnv, context: &mut Context) -> bool {~+~N", p); } else { writef(g, "~Mpub fn ~E(env: &mut SnowballEnv) -> bool {~+~N", p); generate_setup_context(g); if (q->used != q->definition) { // This external needs to be callable as a routine, so generate // the actual code like a routine with an external which just // forwards to that. writef(g, "~Mreturn ~W(env, context);~N", p); w(g, "~-~M}~N"); writef(g, "~Mfn ~W(env: &mut SnowballEnv, context: &mut Context) -> bool {~+~N", p); } } if (q->amongvar_needed) { w(g, "~Mlet mut among_var;~N"); } /* Declare localised variables. */ for (struct name * name = g->analyser->names; name; name = name->next) { if (name->local_to == q) { switch (name->type) { case t_string: // String variables not localised for Rust currently. 
/* Emit Rust for `substring`, the table-lookup half of an among.
 *
 * Where possible an inline pre-check on a single byte of the input is
 * emitted first, so that the find_among helper is only called when a match
 * is possible.  Depending on the among's properties the helper call is then
 * emitted as an assignment to among_var, a bare call, a `return`, or a
 * failure test; an among whose strings are all one symbol long needs no
 * helper call at all.
 */
static void generate_substring(struct generator * g, struct node * p) {
    write_comment(g, p);

    struct among * x = p->among;
    /* Shared value of (ch >> 5) over the inspected bytes, or -1 if they
     * don't all share one. */
    int block = -1;
    /* Bit (ch & 0x1f) is set for each inspected byte seen so far. */
    unsigned int bitmap = 0;
    struct amongvec * among_cases = x->b;
    /* Index of the empty string among the cases, or -1 if there is none. */
    int empty_case = -1;
    /* Number of distinct inspected bytes; the first two are in cases[]. */
    int n_cases = 0;
    symbol cases[2];
    int shortest_size = x->shortest_size;
    /* Set when an `else {` block has been emitted and still needs closing. */
    int block_opened = 0;

    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->I[0] = x->number;
    g->I[1] = x->literalstring_count;

    /* In forward mode with non-ASCII UTF-8 characters, the first byte
     * of the string will often be the same, so instead look at the last
     * common byte position.
     *
     * In backward mode, we can't match if there are fewer characters before
     * the current position than the minimum length.
     */
    for (int c = 0; c < x->literalstring_count; ++c) {
        symbol ch;
        if (among_cases[c].size == 0) {
            empty_case = c;
            continue;
        }

        /* Forward: byte at offset shortest_size - 1.  Backward: the final
         * byte of the string. */
        if (p->mode == m_forward) {
            ch = among_cases[c].b[shortest_size - 1];
        } else {
            ch = among_cases[c].b[among_cases[c].size - 1];
        }
        if (n_cases == 0) {
            block = ch >> 5;
        } else if (ch >> 5 != block) {
            /* Bytes span more than one 32-value block, so the bitmap test
             * can't be used. */
            block = -1;
            if (n_cases > 2) break;
        }
        if (block == -1) {
            /* Without a bitmap only up to two distinct bytes can be
             * checked; a third ends the scan with n_cases > 2. */
            if (n_cases > 0 && ch == cases[0]) continue;
            if (n_cases < 2) {
                cases[n_cases++] = ch;
            } else if (ch != cases[1]) {
                ++n_cases;
                break;
            }
        } else {
            if ((bitmap & (1u << (ch & 0x1f))) == 0) {
                bitmap |= 1u << (ch & 0x1f);
                if (n_cases < 2)
                    cases[n_cases] = ch;
                ++n_cases;
            }
        }
    }

    /* A pre-check is possible with a bitmap or with at most two bytes. */
    int pre_check = (block != -1 || n_cases <= 2);
    if (pre_check) {
        /* Holds the byte-access expression; referenced via g->S[1] only
         * while this block is live. */
        char buf[64];
        g->I[2] = block;
        g->I[3] = bitmap;
        g->I[4] = shortest_size - 1;
        if (p->mode == m_forward) {
            checked_snprintf(buf, sizeof(buf), "env.current.as_bytes()[(env.cursor + %d) as usize]", shortest_size - 1);
            g->S[1] = buf;
            if (shortest_size == 1) {
                writef(g, "~Mif (env.cursor >= env.limit", p);
            } else {
                writef(g, "~Mif (env.cursor + ~I4 >= env.limit", p);
            }
        } else {
            g->S[1] = "env.current.as_bytes()[(env.cursor - 1) as usize]";
            if (shortest_size == 1) {
                writef(g, "~Mif (env.cursor <= env.limit_backward", p);
            } else {
                writef(g, "~Mif (env.cursor - ~I4 <= env.limit_backward", p);
            }
        }
        assert(n_cases > 0);
        if (n_cases == 1) {
            g->I[4] = cases[0];
            writef(g, " || ~S1 as u8 != ~I4 as u8", p);
        } else if (n_cases == 2) {
            g->I[4] = cases[0];
            g->I[5] = cases[1];
            writef(g, " || (~S1 as u8 != ~I4 as u8 && ~S1 as u8 != ~I5 as u8)", p);
        } else {
            writef(g, " || ~S1 as u8 >> 5 != ~I2 as u8 || ((~I3 as i32 >> (~S1 as u8 & 0x1f)) & 1) == 0", p);
        }
        write_string(g, ") ");
        if (empty_case != -1 && !among_cases[empty_case].function) {
            /* If the among includes the empty string, it can never fail
             * so not matching the bitmap means we match the empty string.
             */
            /* NOTE(review): the block opened here is only closed on the
             * amongvar_needed path below, and the emitted code assigns
             * among_var - confirm an among containing the empty string
             * always has amongvar_needed set. */
            g->I[4] = among_cases[empty_case].result;
            writef(g, "{among_var = ~I4;}~N~Melse ", p);
            write_block_start(g);
            block_opened = 1;
        } else {
            writef(g, "~f~N", p);
        }
    } else {
#ifdef OPTIMISATION_WARNINGS
        printf("Couldn't shortcut among %d\n", x->number);
#endif
    }

    if (x->amongvar_needed) {
        /* The among result is needed later, so capture it in among_var. */
        writef(g, "~Mamong_var = env.find_among~S0(A_~I0, context);~N", p);
        if (!x->always_matches) {
            write_failure_if(g, "among_var == 0", p);
        }
        if (block_opened) write_block_end(g);
        return;
    }
    if (pre_check && !x->function_count) {
        // If all cases are one symbol long (so one byte of UTF-8, one
        // character long in fixed-width encodings) then we don't need to call
        // the helper and can just inc/dec the cursor by 1.
        if (x->longest_size == 1 && !x->always_matches) {
            if (p->mode == m_forward) {
                w(g, "~Menv.cursor += 1;~N");
            } else {
                w(g, "~Menv.cursor -= 1;~N");
            }
            // Suppress generating table for this among.
            x->used = false;
            return;
        }
    }

    if (x->always_matches) {
        /* Result can be ignored: the call is made without testing it. */
        writef(g, "~Menv.find_among~S0(A_~I0, context);~N", p);
    } else if (x->command_count == 0 && tailcallable(g, p)) {
        /* Tail position with no commands: return the match result. */
        writef(g, "~Mreturn env.find_among~S0(A_~I0, context) != 0;~N", p);
        x->node->right = NULL;
        g->unreachable = true;
    } else {
        write_failure_if(g, "env.find_among~S0(A_~I0, context) == 0", p);
    }
}
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mmatch among_var {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~M~I0 => {~N~+"); generate(g, x->commands[i - 1]); w(g, "~-~M}~N"); g->unreachable = false; } w(g, "~M_ => ()~N"); w(g, "~-~M}~N"); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. if (inverted) { writef(g, "~Mreturn !~V;~N", p); } else { writef(g, "~Mreturn ~V;~N", p); } p->right = NULL; g->unreachable = true; return; } if (inverted) { write_failure_if(g, "~V", p); } else { write_failure_if(g, "!~V", p); } } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Menv.debug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { if (g->unreachable) return; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break; case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break; case c_goto_non: generate_GO_grouping(g, p, 1, 1); break; case c_gopast_non: generate_GO_grouping(g, p, 0, 
1); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_stringassign: generate_stringassign(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_assign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: case c_ne: case c_gt: case c_ge: case c_lt: case c_le: generate_integer_test(g, p); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p, false); break; case c_not_booltest: generate_booltest(g, p, true); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; case c_functionend: generate_functionend(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); 
g->failure_str = a1; } /* rustc emits warnings if variables don't match the style guide */ /* (i.e. upper-case for globals, snake case for fields etc.) */ /* To allow warning free compilation of generated code and */ /* consistency with snowball variable namings we allow some kind of warnings here */ static void generate_allow_warnings(struct generator * g) { w(g, "#![allow(non_snake_case)]~N"); w(g, "#![allow(non_upper_case_globals)]~N"); w(g, "#![allow(unused_mut)]~N"); w(g, "#![allow(unused_parens)]~N"); w(g, "#![allow(unused_variables)]~N"); } static void generate_class_begin(struct generator * g) { w(g, "use snowball::SnowballEnv;~N"); if (g->analyser->amongs) { w(g, "use snowball::Among;~N~N"); } } static void generate_among_table(struct generator * g, struct among * x) { write_comment(g, x->node); struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count; w(g, "~N~Mstatic A_~I0: &'static [Among; ~I1] = &[~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; w(g, "~MAmong("); write_literal_string(g, v[i].b); w(g, ", ~I0, ~I1, "); if (v[i].function != NULL) { w(g, "Some(&"); write_varname(g, v[i].function); w(g, ")"); } else { w(g, "None"); } w(g, "),~N"); } w(g, "~-~M];~N"); } static void generate_amongs(struct generator * g) { struct str * s = g->outbuf; g->outbuf = g->declarations; for (struct among * x = g->analyser->amongs; x; x = x->next) { if (x->used) generate_among_table(g, x); } g->outbuf = s; } static void set_bit(symbol * b, int i) { b[i >> 3] |= 1 << (i & 7); } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7) / 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->I[0] = size; w(g, "~N~Mstatic "); write_varname(g, q->name); w(g, ": 
&'static [u8; ~I0] = &["); for (int i = 0; i < size; i++) { if (i) w(g, ", "); write_int(g, map[i]); } w(g, "];~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct str * s = g->outbuf; g->outbuf = g->declarations; for (struct grouping * q = g->analyser->groupings; q; q = q->next) { generate_grouping_table(g, q); } g->outbuf = s; } static void generate_members(struct generator * g) { w(g, "#[derive(Clone)]~N"); w(g, "struct Context {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: write_margin(g); write_varname(g, q); w(g, ": String,~N"); break; case t_integer: write_margin(g); write_varname(g, q); w(g, ": i32,~N"); break; case t_boolean: write_margin(g); write_varname(g, q); w(g, ": bool,~N"); break; } } w(g, "~-}~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void generate_program_rust(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "//! 
/* Write the UTF-8 symbol string `p` as a double-quoted Zig string literal.
 *
 * The string is decoded one code point at a time; each code point is either
 * written literally (with `"` and `\` backslash-escaped) or as a \u{...}
 * escape.
 */
static void write_literal_string(struct generator * g, symbol * p) {
    int i = 0;
    write_char(g, '"');
    while (i < SIZE(p)) {
        int ch;
        i += get_utf8(p + i, &ch);
        // Write out ASCII and lower Unicode printables as literal characters.
        // Use escapes for anything from 0x590 upwards as a crude way to avoid
        // RTL (right-to-left) characters affecting the rendering of source
        // character order in confusing ways.
        if ((32 <= ch && ch < 127) || (0xa0 < ch && ch < 0x590)) {
            if (ch == '"' || ch == '\\') write_char(g, '\\');
            write_wchar_as_utf8(g, ch);
        } else {
            // Zig Unicode escapes are variable width: \u{XXXX}
            write_string(g, "\\u{");
            write_hex(g, ch);
            write_char(g, '}');
        }
    }
    write_char(g, '"');
}
"env.nextChar();" : "env.prevChar();"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; w(g, "~Mlab~I0: while (true) {~N~+"); } static void wsetlab_end(struct generator * g, int n) { if (!g->unreachable) { g->I[0] = n; w(g, "~Mbreak :lab~I0;~N"); } w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { g->I[0] = n; w(g, "~Mbreak :lab~I0;~N"); g->unreachable = true; } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn false;"); break; default: w(g, "~Mbreak :lab"); write_int(g, g->failure_label); write_char(g, ';'); } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, const char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ") ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "env.cursor >= env.limit", p); } else { write_failure_if(g, "env.cursor <= env.limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; while (input[i]) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } ch = input[i++]; switch (ch) { case '~': write_char(g, '~'); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->S) / sizeof(g->S[0]))) { printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } write_string(g, g->S[j]); continue; } case 'B': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->B) / sizeof(g->B[0]))) goto invalid_escape2; write_s(g, g->B[j]); continue; } case 'I': { int j = input[i++] - '0'; if (j < 0 || j > (int)(sizeof(g->I) / sizeof(g->I[0]))) goto invalid_escape2; write_int(g, g->I[j]); continue; } case 'E': { // Write an external name. 
if (g->options->externals_prefix) { write_string(g, g->options->externals_prefix); } write_s(g, p->name->s); continue; } case 'V': write_varref(g, p->name); continue; case 'W': write_varname(g, p->name); continue; case 'L': write_literal_string(g, p->literalstring); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_s(g, g->options->name); continue; default: printf("Invalid escape sequence ~%c in writef(g, \"%s\", p)\n", ch, input); exit(1); invalid_escape2: printf("Invalid escape sequence ~%c%c in writef(g, \"%s\", p)\n", ch, input[i - 1], input); exit(1); } } } static void w(struct generator * g, const char * s) { writef(g, s, NULL); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "snowball.MaxInt"); break; case c_minint: write_string(g, "snowball.MinInt"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_divide: write_string(g, "@divTrunc"); s = ", "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "@as(i32, @intCast(env.cursor))"); break; case c_limit: w(g, p->mode == m_forward ? 
"@as(i32, @intCast(env.limit))" : "@as(i32, @intCast(env.limit_backward))"); break; case c_len: w(g, "snowball.runeCountInString(env.getCurrent())"); break; case c_size: w(g, "@as(i32, @intCast(env.getCurrent().len))"); break; case c_lenof: write_string(g, "snowball.runeCountInString("); write_stringref(g, p->name); write_char(g, ')'); break; case c_sizeof: write_string(g, "@as(i32, @intCast("); write_stringref(g, p->name); write_string(g, ".len))"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_and(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (savevar && p->right != NULL) write_restorecursor(g, p, savevar); p = p->right; } if (savevar) { str_delete(savevar); } } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_for_or(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); bool end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (savevar && K_needed_node_on_f(p)) { write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == NULL) { /* p should never be NULL after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != NULL) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); } p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } if (savevar) { str_delete(savevar); } } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Menv.limit_backward = env.cursor;~N" "~Menv.cursor = env.limit;~N", p); generate(g, p->left); w(g, "~Menv.cursor = env.limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed_node_on_f(p->left)) { savevar = vars_newname(g); } int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); append_restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; if (savevar) { str_delete(savevar); } } static void 
generate_set(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~M~V = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } generate(g, p->left); if (savevar) { if (!g->unreachable) { write_restorecursor(g, p, savevar); } str_delete(savevar); } } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = NULL; if (K_needed(p->left)) { savevar = vars_newname(g); } write_comment(g, p); if (savevar) { write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); writef(g, "~M_ = ~W(env, @ptrCast(context));~N", p->left); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; } if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "B"; g->S[1] = complement ? "In" : "Out"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.go~S1Grouping~S0(&~W, ~I0, ~I1)", p); if (!is_goto) { w(g, p->mode == m_forward ? 
"~Menv.nextChar();~N" : "~Menv.prevChar();~N"); } } static void generate_GO(struct generator * g, struct node * p, int is_goto) { write_comment(g, p); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); bool end_unreachable = false; int golab = new_label(g); g->I[0] = golab; w(g, "~Mgolab~I0: while (true) {~N~+"); struct str * savevar = NULL; if (is_goto || repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (is_goto) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak :golab~I0;~N"); g->unreachable = true; } wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~M{~N~+~Mvar ~B0: i32 = "); generate_AE(g, p->AE); w(g, ";~N"); w(g, "~Mwhile (~B0 > 0) {~N~+"); w(g, "~M~B0 -= 1;~N"); generate(g, p->left); w(g, "~-~M}~N"); w(g, "~-~M}~N"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { int replab = new_label(g); g->I[0] = replab; writef(g, "~Mreplab~I0: while (true) {~N~+", p); struct str * savevar = NULL; if (repeat_restore(p->left)) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } int label = new_label(g); g->failure_label = label; 
str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable) { if (loopvar != NULL) { g->B[0] = str_data(loopvar); w(g, "~M~B0 -= 1;~N"); } g->I[0] = replab; w(g, "~Mcontinue :replab~I0;~N"); g->unreachable = true; } wsetlab_end(g, g->failure_label); g->unreachable = false; if (savevar) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->I[0] = replab; w(g, "~Mbreak :replab~I0;~N~-~M}~N"); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { write_comment(g, p); struct str * loopvar = vars_newname(g); g->B[0] = str_data(loopvar); w(g, "~Mvar ~B0: i32 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (@as(i32, @intCast(env.cursor)) ~S0 "); generate_AE(g, p->AE); writef(g, ") ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Menv.cursor = @intCast(@as(u32, @intCast("); generate_AE(g, p->AE); writef(g, ")));~N", p); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); // Generate the AE to a temporary block so we can substitute it in // write_failure_if(). struct str * ae = str_new(); struct str * s = g->outbuf; g->outbuf = ae; generate_AE(g, p->AE); g->outbuf = s; g->B[0] = str_data(ae); g->S[0] = p->mode == m_forward ? "" : "Back"; g->S[1] = p->AE->type == c_number ? 
"" : "Checked"; write_failure_if(g, "!env.hop~S0~S1(~B0)", p); str_delete(ae); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Menv.sliceDel() catch return false;~N", p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.limit" : "env.limit_backward"; writef(g, "~Menv.cursor = ~S0;~N", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.bra" : "env.ket"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.ket" : "env.bra"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); write_margin(g); if (p->name->local_to == NULL) { write_string(g, "context."); } write_varname(g, p->name); write_string(g, ".assign(env.allocator, env.assignTo()) catch return false;"); write_newline(g); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); write_margin(g); if (p->name->local_to == NULL) { write_string(g, "context."); } write_varname(g, p->name); write_string(g, ".assign(env.allocator, env.sliceTo()) catch return false;"); write_newline(g); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != NULL) { write_literal_string(g, b); } else { write_stringref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { write_margin(g); write_block_start(g); write_comment(g, p); int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~Mconst c = env.cursor;~N"); w(g, "~Mconst bra = env.cursor;~N"); w(g, "~Mconst ket = env.cursor;~N"); writef(g, "~Menv.insert(bra, ket, ", p); generate_address(g, p); 
writef(g, ") catch return false;~N", p); if (keep_c) w(g, "~Menv.cursor = c;~N"); write_block_end(g); } static void generate_stringassign(struct generator * g, struct node * p) { write_margin(g); write_block_start(g); write_comment(g, p); int keep_c = p->mode == m_forward; if (keep_c) { writef(g, "~Mconst c = env.cursor;~N", p); } if (p->mode == m_forward) { writef(g, "~Menv.insert(env.cursor, env.limit, ", p); } else { writef(g, "~Menv.insert(env.limit_backward, env.cursor, ", p); } generate_address(g, p); writef(g, ") catch return false;~N", p); if (keep_c) { w(g, "~Menv.cursor = c;~N"); } write_block_end(g); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Menv.sliceFrom("); generate_address(g, p); writef(g, ") catch return false;~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { write_comment(g, p); struct str * varname = vars_newname(g); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; write_comment(g, q); g->S[0] = q->mode == m_forward ? 
">" : "<";
        /* Same guard as generate_tomark(): fail if the cursor is already
         * past the mark. */
        w(g, "~Mif (@as(i32, @intCast(env.cursor)) ~S0 ");
        generate_AE(g, q->AE);
        w(g, ") ");
        write_block_start(g);
        write_failure(g);
        write_block_end(g);
        g->unreachable = false;
        g->B[0] = str_data(varname);
        if (p->mode == m_forward) {
            /* Forward: the generated variable ends up holding the distance
             * (old limit - new limit), so the limit can be restored with
             * `+=` further down. */
            w(g, "~Mvar ~B0 = env.limit;~N");
            w(g, "~Menv.limit = @intCast(@as(u32, @intCast(");
            generate_AE(g, q->AE);
            w(g, ")));~N");
            w(g, "~M~B0 -= env.limit;~N");
        } else {
            /* Backward: the generated variable holds the old backward limit
             * itself. */
            w(g, "~Mconst ~B0 = env.limit_backward;~N");
            w(g, "~Menv.limit_backward = @intCast(@as(u32, @intCast(");
            generate_AE(g, q->AE);
            w(g, ")));~N");
        }
        /* The failure string becomes the statement which restores the
         * limit. */
        if (p->mode == m_forward) {
            str_assign(g->failure_str, "env.limit += ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        } else {
            str_assign(g->failure_str, "env.limit_backward = ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        }
    } else {
        /* General case: run p->left to find the new limit position, then
         * restore the cursor saved here. */
        struct str * savevar = vars_newname(g);
        write_savecursor(g, p, savevar);
        generate(g, p->left);
        if (!g->unreachable) {
            g->B[0] = str_data(varname);
            if (p->mode == m_forward) {
                w(g, "~Mconst ~B0 = env.limit - env.cursor;~N");
                w(g, "~Menv.limit = env.cursor;~N");
            } else {
                w(g, "~Mconst ~B0 = env.limit_backward;~N");
                w(g, "~Menv.limit_backward = env.cursor;~N");
            }
            write_restorecursor(g, p, savevar);
            /* As above: the failure string restores the limit. */
            if (p->mode == m_forward) {
                str_assign(g->failure_str, "env.limit += ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            } else {
                str_assign(g->failure_str, "env.limit_backward = ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            }
        }
        str_delete(savevar);
    }
    if (!g->unreachable) {
        /* Generate the command run under the new limit (p->aux). */
        generate(g, p->aux);
        if (!g->unreachable) {
            /* Success path: emit the same limit-restoring statement. */
            write_margin(g);
            write_str(g, g->failure_str);
            write_newline(g);
        }
    }
    str_delete(varname);
}

/* dollar sets snowball up to operate on a string variable as if it were the
 * current string */
static void generate_dollar(struct generator * g, struct node * p) {
    write_comment(g, p);
    /* Saved failure target; put back after the body has been generated. */
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);
    int label = new_label(g);
g->failure_label = label;
    str_clear(g->failure_str);
    struct str * savevar = vars_newname(g);
    g->B[0] = str_data(savevar);
    /* Generated code: clone the environment into a fresh variable, then make
     * the string variable the current string. */
    writef(g, "~Mvar ~B0 = env.clone() catch return false;~N", p);
    write_margin(g);
    write_string(g, "env.setCurrent(");
    write_stringref(g, p->name);
    write_string(g, ") catch return false;");
    write_newline(g);
    if (p->left->possible_signals == -1) {
        /* Assume failure. */
        w(g, "~Mvar ~B0_f = true;~N");
    }
    wsetlab_begin(g, g->failure_label);
    generate(g, p->left);
    if (!g->unreachable && p->left->possible_signals == -1) {
        /* Mark success. */
        g->B[0] = str_data(savevar);
        w(g, "~M~B0_f = false;~N");
    }
    wsetlab_end(g, g->failure_label);
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
    g->B[0] = str_data(savevar);
    /* Generated code: store the (possibly modified) current string back into
     * the variable, then copy the cloned environment back. */
    write_margin(g);
    if (p->name->local_to == NULL) {
        write_string(g, "context.");
    }
    write_varname(g, p->name);
    write_string(g, ".assign(env.allocator, env.getCurrent()) catch return false;");
    write_newline(g);
    writef(g, "~Menv.copyFrom(&~B0) catch return false;~N", p);
    if (p->left->possible_signals == 0) {
        // p->left always signals f.
        write_failure(g);
    } else if (p->left->possible_signals == -1) {
        /* Outcome only known at run time: test the generated flag. */
        write_failure_if(g, "~B0_f", p);
    }
    str_delete(savevar);
}

/* Emit `<variable> <s> <AE>;` where s is the Zig assignment operator text
 * supplied by the caller (e.g. "=", "+=").
 */
static void generate_integer_assign(struct generator * g, struct node * p, const char * s) {
    write_comment(g, p);
    g->S[0] = s;
    writef(g, "~M~V ~S0 ", p);
    generate_AE(g, p->AE);
    w(g, ";~N");
}

/* Emit an integer comparison.  When tailcallable() allows it, the comparison
 * itself is returned; otherwise an `if` on the inverted relation guards the
 * failure code.
 */
static void generate_integer_test(struct generator * g, struct node * p) {
    write_comment(g, p);
    int relop = p->type;
    int optimise_to_return = tailcallable(g, p);
    if (optimise_to_return) {
        w(g, "~Mreturn ");
        p->right = NULL;
    } else {
        w(g, "~Mif (");
        // We want the inverse of the snowball test here.
relop ^= 1; } generate_AE(g, p->left); write_relop(g, relop); generate_AE(g, p->AE); if (optimise_to_return) { w(g, ";~N"); g->unreachable = true; } else { write_string(g, ") "); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_call(struct generator * g, struct node * p) { int signals = p->name->definition->possible_signals; write_comment(g, p); if (tailcallable(g, p)) { /* Tail call. */ writef(g, "~Mreturn ~W(env, @ptrCast(context));~N", p); p->right = NULL; g->unreachable = true; return; } if (just_return_on_fail(g) && signals == 0) { /* Always fails. */ writef(g, "~Mreturn ~W(env, @ptrCast(context));~N", p); g->unreachable = true; return; } if (signals == 1) { /* Always succeeds. */ writef(g, "~M_ = ~W(env, @ptrCast(context));~N", p); } else if (signals == 0) { /* Always fails. */ writef(g, "~M_ = ~W(env, @ptrCast(context));~N", p); write_failure(g); } else { write_failure_if(g, "!~W(env, @ptrCast(context))", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { write_comment(g, p); struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "B"; g->S[1] = complement ? "out" : "in"; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (tailcallable(g, p)) { writef(g, "~Mreturn env.~S1Grouping~S0(&~W, ~I0, ~I1);~N", p); p->right = NULL; g->unreachable = true; } else { write_failure_if(g, "!env.~S1Grouping~S0(&~W, ~I0, ~I1)", p); } } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "B"; if (tailcallable(g, p)) { write_margin(g); write_string(g, "return env.eqS"); write_string(g, g->S[0]); write_char(g, '('); write_stringref(g, p->name); write_string(g, ");"); write_newline(g); p->right = NULL; g->unreachable = true; } else { write_margin(g); write_string(g, "if (!env.eqS"); write_string(g, g->S[0]); write_char(g, '('); write_stringref(g, p->name); write_string(g, ")) "); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } } static void generate_literalstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "B"; if (tailcallable(g, p)) { writef(g, "~Mreturn env.eqS~S0(~L);~N", p); p->right = NULL; g->unreachable = true; } else { write_failure_if(g, "!env.eqS~S0(~L)", p); } } static void generate_cast_context(struct generator * g) { w(g, "~Mconst context: *Context = @ptrCast(@alignCast(ctx));~N"); w(g, "~Msuppress_any_unused_warning(@as(*anyopaque, @ptrCast(context)));~N"); } static void generate_setup_context(struct generator * g) { w(g, "~Mvar context_val = Context{};~N"); w(g, "~Mconst context = &context_val;~N"); w(g, "~Msuppress_any_unused_warning(@as(*anyopaque, @ptrCast(context)));~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to != NULL || q->type != t_string) continue; w(g, "~Mdefer context."); write_varname(g, q); w(g, ".deinit(env.allocator);~N"); } } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; write_newline(g); write_comment(g, p); if (q->type == t_routine) { writef(g, "~Mfn ~W(env: *snowball.Env, ctx: *anyopaque) bool {~+~N", p); generate_cast_context(g); } else { writef(g, "~Mpub fn ~E(env: *snowball.Env) bool {~+~N", p); generate_setup_context(g); if (q->used != q->definition) { // This external needs to be callable as a routine, so generate // the actual code like a routine with an external which just // forwards to that. 
writef(g, "~Mreturn ~W(env, @as(*anyopaque, @ptrCast(context)));~N", p);
            w(g, "~-~M}~N~N");
            writef(g, "~Mfn ~W(env: *snowball.Env, ctx: *anyopaque) bool {~+~N", p);
            generate_cast_context(g);
        }
    }
    if (q->amongvar_needed) {
        w(g, "~Mvar among_var: i32 = 0;~N");
    }
    /* Declare localised variables. */
    for (struct name * name = g->analyser->names; name; name = name->next) {
        if (name->local_to == q) {
            switch (name->type) {
                case t_string:
                    // NOTE(review): the comment previously here said string
                    // variables are not localised for Zig, but this case does
                    // emit a local snowball.String with a deferred deinit --
                    // confirm which is intended.
                    w(g, "~Mvar ");
                    write_varname(g, name);
                    w(g, ": snowball.String = .{};~N");
                    w(g, "~Mdefer ");
                    write_varname(g, name);
                    w(g, ".deinit(env.allocator);~N");
                    break;
                case t_integer:
                    w(g, "~Mvar ");
                    write_varname(g, name);
                    w(g, ": i32 = 0;~N");
                    break;
                case t_boolean:
                    w(g, "~Mvar ");
                    write_varname(g, name);
                    w(g, ": bool = false;~N");
                    break;
            }
        }
    }
    /* Reset per-function generator state: label and variable numbering
     * restart at zero, and failure means returning from the function. */
    g->next_label = 0;
    g->var_number = 0;
    str_clear(g->failure_str);
    g->failure_label = x_return;
    g->unreachable = false;
    /* Generate function body. */
    generate(g, p->left);
    if (p->left->right) {
        assert(p->left->right->type == c_functionend);
        /* Skip the function-end node when the body always signals f
         * (possible_signals == 0). */
        if (p->left->possible_signals) {
            generate(g, p->left->right);
        }
    }
    w(g, "~-~M}~N");
}

/* Emit the final `return true;` of a generated function; p is unused. */
static void generate_functionend(struct generator * g, struct node * p) {
    (void)p;
    w(g, "~Mreturn true;~N");
}

/* Emit the findAmong call for a `substring`/`among`, in the form selected by
 * the flags of the among (amongvar_needed, always_matches, command_count).
 */
static void generate_substring(struct generator * g, struct node * p) {
    write_comment(g, p);
    struct among * x = p->among;
    g->S[0] = p->mode == m_forward ?
"" : "B"; g->I[0] = x->number; if (x->amongvar_needed) { writef(g, "~Mamong_var = env.findAmong~S0(&a_~I0, @as(*anyopaque, @ptrCast(context)));~N", p); if (!x->always_matches) { write_failure_if(g, "among_var == 0", p); } } else if (x->always_matches) { writef(g, "~M_ = env.findAmong~S0(&a_~I0, @as(*anyopaque, @ptrCast(context)));~N", p); } else if (x->command_count == 0 && tailcallable(g, p)) { writef(g, "~Mreturn env.findAmong~S0(&a_~I0, @as(*anyopaque, @ptrCast(context))) != 0;~N", p); x->node->right = NULL; g->unreachable = true; } else { write_failure_if(g, "env.findAmong~S0(&a_~I0, @as(*anyopaque, @ptrCast(context))) == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == NULL) { generate_substring(g, p); } else { write_comment(g, p); } if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { w(g, "~Mswitch (among_var) {~N~+"); for (int i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~M~I0 => {~N~+"); generate(g, x->commands[i - 1]); w(g, "~-~M},~N"); g->unreachable = false; } w(g, "~Melse => {},~N"); w(g, "~-~M}~N"); } } static void generate_booltest(struct generator * g, struct node * p, int inverted) { write_comment(g, p); if (tailcallable(g, p)) { // Optimise at end of function. 
if (inverted) {
            writef(g, "~Mreturn !~V;~N", p);
        } else {
            writef(g, "~Mreturn ~V;~N", p);
        }
        p->right = NULL;
        g->unreachable = true;
        return;
    }
    /* Not in tail position: fail when the variable has the wrong value. */
    if (inverted) {
        write_failure_if(g, "~V", p);
    } else {
        write_failure_if(g, "!~V", p);
    }
}

/* Emit Zig for `false`: the failure code, unconditionally. */
static void generate_false(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_failure(g);
}

/* Emit a call to env.debug() carrying a running debug counter and the source
 * line number of p.
 */
static void generate_debug(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->I[0] = g->debug_count++;
    g->I[1] = p->line_number;
    writef(g, "~Menv.debug(~I0, ~I1);~N", p);
}

/* Dispatch on the node type and emit Zig for node p.
 *
 * Nothing is emitted while g->unreachable is set.  The failure label and
 * failure string are saved before dispatching and restored afterwards, so a
 * handler may change them freely.
 */
static void generate(struct generator * g, struct node * p) {
    if (g->unreachable) return;
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);
    switch (p->type) {
        case c_define: generate_define(g, p); break;
        case c_bra: generate_bra(g, p); break;
        case c_and: generate_and(g, p); break;
        case c_or: generate_or(g, p); break;
        case c_backwards: generate_backwards(g, p); break;
        case c_not: generate_not(g, p); break;
        case c_set: generate_set(g, p); break;
        case c_unset: generate_unset(g, p); break;
        case c_try: generate_try(g, p); break;
        case c_fail: generate_fail(g, p); break;
        case c_reverse:
        case c_test: generate_test(g, p); break;
        case c_do: generate_do(g, p); break;
        case c_goto: generate_GO(g, p, 1); break;
        case c_gopast: generate_GO(g, p, 0); break;
        case c_goto_grouping: generate_GO_grouping(g, p, 1, 0); break;
        case c_gopast_grouping: generate_GO_grouping(g, p, 0, 0); break;
        case c_goto_non: generate_GO_grouping(g, p, 1, 1); break;
        case c_gopast_non: generate_GO_grouping(g, p, 0, 1); break;
        case c_repeat: generate_repeat(g, p); break;
        case c_loop: generate_loop(g, p); break;
        case c_atleast: generate_atleast(g, p); break;
        case c_tomark: generate_tomark(g, p); break;
        case c_hop: generate_hop(g, p); break;
        case c_delete: generate_delete(g, p); break;
        case c_next: generate_next(g, p); break;
        case c_tolimit: generate_tolimit(g, p); break;
        case c_leftslice: generate_leftslice(g, p); break;
        case c_rightslice: generate_rightslice(g, p); break;
case c_assignto: generate_assignto(g, p); break;
        case c_sliceto: generate_sliceto(g, p); break;
        case c_stringassign: generate_stringassign(g, p); break;
        case c_insert:
        case c_attach: generate_insert(g, p, p->type); break;
        case c_slicefrom: generate_slicefrom(g, p); break;
        case c_setlimit: generate_setlimit(g, p); break;
        case c_dollar: generate_dollar(g, p); break;
        case c_assign: generate_integer_assign(g, p, "="); break;
        case c_plusassign: generate_integer_assign(g, p, "+="); break;
        case c_minusassign: generate_integer_assign(g, p, "-="); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
        case c_divideassign:
            /* Emitted inline via @divTrunc rather than through
             * generate_integer_assign(). */
            write_comment(g, p);
            writef(g, "~M~V = @divTrunc(~V, ", p);
            generate_AE(g, p->AE);
            w(g, ");~N");
            break;
        case c_eq:
        case c_ne:
        case c_gt:
        case c_ge:
        case c_lt:
        case c_le:
            generate_integer_test(g, p);
            break;
        case c_call: generate_call(g, p); break;
        case c_grouping: generate_grouping(g, p, false); break;
        case c_non: generate_grouping(g, p, true); break;
        case c_name: generate_namedstring(g, p); break;
        case c_literalstring: generate_literalstring(g, p); break;
        case c_among: generate_among(g, p); break;
        case c_substring: generate_substring(g, p); break;
        case c_booltest: generate_booltest(g, p, false); break;
        case c_not_booltest: generate_booltest(g, p, true); break;
        case c_false: generate_false(g, p); break;
        case c_true: break;
        case c_debug: generate_debug(g, p); break;
        case c_functionend: generate_functionend(g, p); break;
        default:
            /* Unhandled node type: report its numeric code and abort. */
            fprintf(stderr, "%d encountered\n", p->type);
            exit(1);
    }
    /* Put back the failure target saved on entry. */
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
}

/* Emit the start of the generated Zig file: the import of env.zig and the
 * suppress_any_unused_warning() helper used by the generated functions.
 */
static void generate_class_begin(struct generator * g) {
    w(g, "const snowball = @import(\"env.zig\");~N~N"
         "fn suppress_any_unused_warning(ctx: *anyopaque) void {~N~+"
         "~M_ = ctx;~N"
         "~-~M}~N~N");
}

/* Emit the constant table `a_<number>` of snowball.Among entries for among
 * x, one entry per string.
 */
static void generate_among_table(struct generator * g, struct among * x) {
    write_comment(g, x->node);
    struct amongvec * v = x->b;
    g->I[0] = x->number;
    w(g, "~Mconst a_~I0 = 
[_]snowball.Among{~N~+"); for (int i = 0; i < x->literalstring_count; i++) { g->I[0] = v[i].i; g->I[1] = v[i].result; w(g, "~Msnowball.Among{ .s = "); write_literal_string(g, v[i].b); w(g, ", .substring_i = ~I0, .result = ~I1, .method = "); if (v[i].function != NULL) { write_varname(g, v[i].function); } else { w(g, "null"); } w(g, " },~N"); } w(g, "~-~M};~N~N"); } static void generate_amongs(struct generator * g) { for (struct among * x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i >> 3] |= 1 << (i & 7); } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7) / 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); for (int i = 0; i < size; i++) map[i] = 0; for (int i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); w(g, "~Mconst "); write_varname(g, q->name); w(g, " = [_]u8{"); for (int i = 0; i < size; i++) { if (i) w(g, ", "); write_int(g, map[i]); } w(g, "};~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { for (struct grouping * q = g->analyser->groupings; q; q = q->next) { generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { w(g, "const Context = struct {~+~N"); for (struct name * q = g->analyser->names; q; q = q->next) { if (q->local_to) continue; switch (q->type) { case t_string: write_margin(g); write_varname(g, q); w(g, ": snowball.String = .{},~N"); break; case t_integer: write_margin(g); write_varname(g, q); w(g, ": i32 = 0,~N"); break; case t_boolean: write_margin(g); write_varname(g, q); w(g, ": bool = false,~N"); break; } } w(g, "~-};~N"); } static void generate_methods(struct generator * g) { for (struct node * p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void generate_program_zig(struct generator * g) { g->outbuf = str_new(); 
g->failure_str = str_new(); write_start_comment(g, "//! ", NULL); generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-3.1.0/compiler/header.h000066400000000000000000000446551520373054300165670ustar00rootroot00000000000000#include #if __STDC_VERSION__ < 202311l # include #endif #define SNOWBALL_VERSION "3.1.0" typedef unsigned char byte; typedef unsigned short symbol; #define MALLOC check_malloc #define FREE check_free // Declare variable `V` of type `struct TYPE *` and dynamically allocate it. // We exit on allocation failure so `V` is always non-NULL. #define NEW(TYPE, V) struct TYPE * V = (struct TYPE *) MALLOC(sizeof(struct TYPE)) // Similar to NEW() but allocates an array of N objects of type `struct TYPE *`. #define NEWVEC(TYPE, V, N) struct TYPE * V = (struct TYPE *) MALLOC(sizeof(struct TYPE) * (N)) #define SIZE(p) ((const int *)(p))[-1] #define SET_SIZE(p, n) ((int *)(p))[-1] = (n) #define ADD_TO_SIZE(p, n) ((int *)(p))[-1] += (n) #define CAPACITY(p) ((int *)(p))[-2] extern symbol * create_b(int n); extern void report_b(FILE * out, const symbol * p); extern void lose_b(symbol * p); extern symbol * increase_capacity_b(symbol * p, int n); extern symbol * add_to_b(symbol * p, const symbol * q, int n); extern symbol * copy_b(const symbol * p); extern char * b_to_sz(const symbol * p); extern symbol * add_symbol_to_b(symbol * p, symbol ch); // These routines are like those above but work in byte instead of symbol. 
extern byte * create_s(int n);
extern byte * create_s_from_sz(const char * s);
extern byte * create_s_from_data(const char * s, int n);
extern void report_s(FILE * out, const byte * p);
extern void lose_s(byte * p);
extern byte * increase_capacity_s(byte * p, int n);
extern byte * ensure_capacity_s(byte * p, int n);
extern byte * copy_s(const byte * p);
extern byte * add_s_to_s(byte * p, const byte * s);
extern byte * add_slen_to_s(byte * p, const char * s, int n);
extern byte * add_sz_to_s(byte * p, const char * s);
extern byte * add_char_to_s(byte * p, char ch);

// "" LIT is a trick to make compilation fail if LIT is not a string literal.
// The length passed is sizeof(LIT) - 1, i.e. it excludes the terminating NUL.
#define add_literal_to_s(P, LIT) add_slen_to_s(P, "" LIT, sizeof(LIT) - 1)

struct str; /* defined in space.c */

extern struct str * str_new(void);
extern void str_delete(struct str * str);
extern void str_append(struct str * str, const struct str * add);
extern void str_append_ch(struct str * str, char add);
extern void str_append_s(struct str * str, const byte * q);
extern void str_append_string(struct str * str, const char * s);
extern void str_append_int(struct str * str, int i);
extern void str_append_wchar_as_utf8(struct str * str, symbol ch);
extern void str_clear(struct str * str);
extern void str_assign(struct str * str, const char * s);
extern struct str * str_copy(const struct str * old);
extern byte * str_data(const struct str * str);
extern int str_len(const struct str * str);
extern int str_back(const struct str *str);
// NOTE(review): str_pop() and str_pop_n() take a pointer to const although
// their names suggest they modify the string -- confirm against space.c.
extern void str_pop(const struct str *str);
extern void str_pop_n(const struct str *str, int n);

extern void output_str(FILE * outfile, struct str * str);

extern int get_utf8(const symbol * p, int * slot);
extern int put_utf8(int ch, symbol * p);

typedef enum { ENC_SINGLEBYTE = 0, ENC_UTF8, ENC_WIDECHARS } enc;

/* stringdef name and value */
struct m_pair {
    struct m_pair * next;
    byte * name;
    symbol * value;
};

/* struct input must be a prefix of struct tokeniser.
 */
struct input {
    struct input * next;
    byte * p;
    int c;
    char * file;
    // -1 : Release file with: lose_s((byte *)file)
    //  0 : We don't own file.
    //  1 : Release file with: free(file)
    int file_owned;
    int line_number;
};

struct include {
    struct include * next;
    byte * s;
};

enum token_codes {
    /* The relational operator token values are chosen such that we can
     * invert the relation with a simple xor with 1.
     *
     * The resulting pairs are gt/le (0/1), ge/lt (2/3) and eq/ne (4/5).
     */
    c_gt = 0, c_le, c_ge, c_lt, c_eq, c_ne,

    /* Other token values just need to be unique. */
    c_among, c_and, c_as, c_assign, c_assignto, c_atleast,
    c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards,
    c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug,
    c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do,
    c_dollar, c_externals, c_fail, c_false, c_for, c_get,
    c_gopast, c_goto, c_groupings, c_hex, c_hop, c_insert,
    c_integers, c_ket, c_leftslice, c_len, c_lenof, c_limit, c_loop,
    c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
    c_multiplyassign, c_next, c_non, c_not, c_or, c_plus,
    c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines,
    c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom,
    c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring,
    c_test, c_tolimit, c_tomark, c_true, c_try, c_unset,

    /* These token values don't directly correspond to a keyword. */
    c_name,
    c_number,
    c_literalstring,

    /* These token values are synthesised by the analyser. */
    c_stringassign,
    c_neg,
    c_call,
    c_grouping,
    c_booltest,
    c_functionend,
    c_goto_grouping,
    c_gopast_grouping,
    c_goto_non,
    c_gopast_non,
    c_not_booltest,

    /* Count of the codes above; used to size token_disabled[] in
     * struct tokeniser. */
    NUM_TOKEN_CODES
};

enum uplus_modes {
    UPLUS_NONE = 0,
    UPLUS_DEFINED,
    UPLUS_UNICODE
};

/* struct input must be a prefix of struct tokeniser. */
struct tokeniser {
    struct input * next;
    byte * p;
    int c;
    char * file;
    // -1 : Release file with: lose_s((byte *)file)
    //  0 : We don't own file.
    //  1 : Release file with: free(file)
    int file_owned;
    int line_number;

    // Used for c_literalstring values.
    symbol * b;
    // Used for c_name names.
byte * s;
    int number;
    // String escape start character or -1.
    int m_start;
    // String escape end character.
    int m_end;
    // Linked list of stringdefs.
    struct m_pair * m_pairs;
    // Nesting depth of get directives.
    int get_depth;
    int error_count;
    int token;
    int previous_token;
    // Set by hold_token().
    bool token_held;
    bool token_reported_as_unexpected;
    enc encoding;

    struct include * includes;

    /* Mode in which U+ has been used:
     * UPLUS_NONE - not used yet
     * UPLUS_DEFINED - stringdef U+xxxx ....
     * UPLUS_UNICODE - {U+xxxx} used with implicit meaning
     */
    int uplusmode;

    // One entry per token code (see disable_token()).
    char token_disabled[NUM_TOKEN_CODES];
};

extern byte * get_input(const char * filename);
extern struct tokeniser * create_tokeniser(byte * b, char * file);
extern int read_token(struct tokeniser * t);
extern int peek_token(struct tokeniser * t);
#define hold_token(T) ((T)->token_held = true)
extern const char * name_of_token(int code);
extern void disable_token(struct tokeniser * t, int code);
extern void close_tokeniser(struct tokeniser * t);

extern int space_count;
extern void * check_malloc(size_t n);
extern void check_free(void * p);

// With GCC-compatible compilers the attribute below enables printf-style
// checking of the format (argument 3) against the variadic arguments
// (starting at argument 4).
extern int checked_snprintf(char *str, size_t size, const char *restrict format, ...)
#ifdef __GNUC__
    __attribute__ ((__format__ (__printf__, 3, 4)))
#endif
    ;

struct node;

struct name {
    struct name * next;
    byte * s;
    byte type;                  /* t_string etc */
    byte mode;                  /* for routines, externals (m_forward, etc) */
    bool value_used;            /* (For variables) is its value ever used? */
    bool initialised;           /* (For variables) is it ever initialised? */
    bool used_in_definition;    /* (grouping) used in grouping definition? */
    bool amongvar_needed;       /* for routines, externals */
    bool among_with_function;   /* (routines/externals) contains among with func */
    bool case_collision;        /* A name of the same type differs only by case */
    struct node * definition;   /* (routines/externals) c_define node */
    int used_in_among;          /* (routines/externals) Count of uses in amongs */
    // Initialised to -1; set to -2 if reachable from an external.
// Reachable names are then numbered 0, 1, 2, ... with separate numbering // per type. int count; // Number of references to this name. int references; struct grouping * grouping; /* for grouping names */ struct node * used; /* First use, or NULL if not used */ struct name * local_to; /* Local to one routine/external */ int among_index; /* for functions used in among */ int declaration_line_number;/* Line number of declaration */ }; struct literalstring { struct literalstring * next; symbol * b; }; struct amongvec { symbol * b; /* the string giving the case */ int size; /* - and its size */ struct node * action; /* the corresponding action */ int i; /* the amongvec index of the longest substring of b */ int result; /* the numeric result for the case */ int line_number; /* for diagnostics */ int function_index; /* 1-based */ // 0-based index giving order of strings in source. Used for stable // sorting of amongvec entries and -coverage. int string_index; struct name * function; }; struct among { struct among * next; struct amongvec * b; /* pointer to the amongvec */ int number; /* amongs are numbered 0, 1, 2 ... */ int literalstring_count; /* in this among */ int command_count; /* in this among (excludes "no command" entries) */ int nocommand_count; /* number of "no command" entries in this among */ int function_count; /* number of different functions in this among */ bool amongvar_needed; /* do we need to set among_var? */ bool always_matches; /* will this among always match? */ bool used; /* is this among in reachable code? */ int same_action; /* type code if same for all actions; <0 otherwise */ int shortest_size; /* smallest non-zero string length in this among */ int longest_size; /* longest string length in this among */ struct node * substring; /* i.e. substring ... among ( ... 
) */ struct node ** commands; /* array with command_count entries */ struct node * node; /* pointer to the node for this among */ struct name * in_routine; /* pointer to name for routine this among is in */ }; struct grouping { struct grouping * next; symbol * b; /* the characters of this group */ int largest_ch; /* character with max code */ int smallest_ch; /* character with min code */ struct name * name; /* so g->name->grouping == g */ int line_number; }; struct node { struct node * next; struct node * left; struct node * aux; /* used in setlimit */ struct among * among; /* used in among */ struct node * right; byte type; byte mode; // We want to distinguish constant AEs which have the same value everywhere // (e.g. 42, 2+2, lenof '{U+0246}') from constant AEs which can have a // different value depending on platform and/or target language and/or // Unicode mode (e.g. maxint, sizeof '{U+0246}') - some warnings which // depend on a constant AE's value should only fire for the first set. bool fixed_constant; // Return 0 for always f. // Return 1 for always t. // Return -1 for don't know (or can raise t or f). signed char possible_signals; struct node * AE; struct name * name; symbol * literalstring; int number; int line_number; }; enum name_types { t_size = 6, t_string = 0, t_boolean = 1, t_integer = 2, t_routine = 3, t_external = 4, t_grouping = 5 /* If this list is extended, adjust write_varname in generator.c */ }; struct analyser { struct tokeniser * tokeniser; struct node * nodes; struct name * names; struct literalstring * literalstrings; byte mode; bool modifyable; /* false inside reverse(...) */ struct node * program; struct node * program_end; /* name_count[i] counts the number of names of type i, where i is an enum * name_types value. These counts *EXCLUDE* localised variables and * variables which optimised away (e.g. declared but never used). 
*/ int name_count[t_size]; /* name_count[t_string] + name_count[t_boolean] + name_count[t_integer] */ int variable_count; struct among * amongs; struct among * amongs_end; int among_with_function_count; /* number of amongs with functions */ struct grouping * groupings; struct grouping * groupings_end; struct node * substring; /* pending 'substring' in current routine definition */ struct name * current_routine; /* routine/external we're currently on. */ enc encoding; bool int_limits_used; /* are maxint or minint used? */ bool debug_used; /* is the '?' command used? */ }; enum analyser_modes { // m_unknown is used as the initial value for struct node's mode member. // When a routine (or external) is used or defined we check the mode // member matches, but for the first use/definition we see we want to // instead set it to the mode of that use/definition. m_forward = 0, m_backward, m_unknown }; extern void print_program(struct analyser * a); extern struct analyser * create_analyser(struct tokeniser * t); extern void close_analyser(struct analyser * a); /** Read and analyse the program. * * @param localise_mask bitmask of variable types the generator can localise */ extern void read_program(struct analyser * a, unsigned localise_mask); struct generator { struct analyser * analyser; struct options * options; bool unreachable; /* false if code can be reached, true if current * code is unreachable. */ int var_number; /* Number of next variable to use. */ struct str * outbuf; /* temporary str to store output */ struct str * declarations; /* str storing variable declarations */ int next_label; int max_label; /* Only used by Python */ int margin; /* Target language code to execute in case of failure. */ struct str * failure_str; int label_used; /* Keep track of whether the failure label is used. 
*/ int failure_label; int debug_count; int copy_from_count; /* count of calls to copy_from() */ const char * S[10]; /* strings */ byte * B[10]; /* byte blocks */ int I[10]; /* integers */ int line_count; /* counts number of lines output */ int line_labelled; /* in ISO C, will need extra ';' if it is a block end */ int literalstring_count; int keep_count; /* used to number keep/restore pairs to avoid compiler warnings about shadowed variables */ bool temporary_used; /* track if temporary variable used (Ada and Pascal) */ char java_import_arrays; /* need `import java.util.Arrays;` */ char java_import_chararraysequence; /* need `import org.tartarus.snowball.CharArraySequence;` */ // Prefix for generated variable names (`v_` by default). const char * varname_prefix; // String to indent by for each margin level (four spaces by default). const char * margin_indent; }; /* Special values for failure_label in struct generator. */ enum special_labels { x_return = -1 }; struct options { /* for the command line: */ byte * output_file; // output_file but without any path. byte * output_leaf; // Extension specified in -o option (or NULL if none). byte * extension; byte * name; FILE * output_src; FILE * output_h; bool syntax_tree; bool comments; bool coverage; enc encoding; enum { LANG_C = 0, // We generate C by default. LANG_ADA, LANG_CPLUSPLUS, LANG_CSHARP, LANG_DART, LANG_GO, LANG_JAVA, LANG_JAVASCRIPT, LANG_PASCAL, LANG_PHP, LANG_PYTHON, LANG_RUST, LANG_ZIG } target_lang; const char * externals_prefix; const char * variables_prefix; const char * cheader; const char * hheader; const char * runtime_path; const char * parent_class_name; const char * package; const char * go_snowball_runtime; const char * string_class; const char * among_class; struct include * includes; struct include * includes_end; }; /* Generator functions common to several backends. 
*/

extern struct generator * create_generator(struct analyser * a, struct options * o);
extern void close_generator(struct generator * g);

/* Return the current value of g->next_label, then advance it by one. */
static inline int new_label(struct generator * g) {
    return g->next_label++;
}

extern struct str * vars_newname(struct generator * g);
extern void write_margin(struct generator * g);
extern void write_char(struct generator * g, int ch);
extern void write_newline(struct generator * g);
extern void write_string(struct generator * g, const char * s);
extern void write_wchar_as_utf8(struct generator * g, symbol ch);
extern void write_int(struct generator * g, int i);
extern void wi3(struct generator * g, int i);
extern void write_hex4(struct generator * g, unsigned ch);
extern void write_hex(struct generator * g, unsigned i);
extern void write_symbol(struct generator * g, symbol s);
extern void write_s(struct generator * g, const byte * b);
extern void write_str(struct generator * g, struct str * str);
extern void write_c_relop(struct generator * g, int relop);

extern void write_comment_content(struct generator * g, struct node * p,
                                  const char * end);
extern void write_generated_comment_content(struct generator * g);
extern void write_start_comment(struct generator * g,
                                const char * comment_start,
                                const char * comment_end);

extern int K_needed(struct node * p);
extern int K_needed_node_on_f(struct node * p);
extern int K_needed_for_and(struct node * p);
extern int K_needed_for_or(struct node * p);
extern int repeat_restore(struct node * p);
extern int just_return_on_fail(struct generator * g);
extern int tailcallable(struct generator * g, struct node * p);

/* Generator for C code. */
extern void generate_program_c(struct generator * g);

/* Generator for Java code. */
extern void generate_program_java(struct generator * g);

/* Generator for Dart code. */
extern void generate_program_dart(struct generator * g);

/* Generator for C# code.
*/ extern void generate_program_csharp(struct generator * g); extern void generate_program_pascal(struct generator * g); extern void generate_program_php(struct generator * g); /* Generator for Python code. */ extern void generate_program_python(struct generator * g); extern void generate_program_js(struct generator * g); extern void generate_program_rust(struct generator * g); extern void generate_program_go(struct generator * g); extern void generate_program_ada(struct generator * g); extern void generate_program_zig(struct generator * g); snowball-3.1.0/compiler/space.c000066400000000000000000000250651520373054300164170ustar00rootroot00000000000000 #include #include #include /* for printf */ #include /* malloc, free */ #include /* memmove */ #include "header.h" #define HEAD 2*sizeof(int) #define EXTENDER 40 /* This module provides a simple mechanism for arbitrary length writable strings, called 'blocks'. They are 'symbol *' items rather than 'char *' items however. The calls are: symbol * b = create_b(n); - create an empty block b with room for n symbols b = increase_capacity_b(b, n); - increase the capacity of block b by n symbols (b may change) b2 = copy_b(b) - copy block b into b2 lose_b(b); - lose block b b = add_to_b(b, p, n); - add the n symbols at address p to the end of the data in b SIZE(b) - is the number of symbols in b For example: symbol * b = create_b(0); for (symbol i = 'A'; i <= 'Z'; i++) { add_symbol_to_b(b, i); } After running the above code b contains: { (symbol)'A', (symbol)'B', ..., (symbol)'Z' } */ /* For a block b, SIZE(b) is the number of symbols so far written into it, CAPACITY(b) the total number it can contain, so SIZE(b) <= CAPACITY(b). In fact blocks have 1 extra character over the promised capacity so they can be zero terminated by 'b[SIZE(b)] = 0;' without fear of overwriting. 
*/ extern symbol * create_b(int n) { symbol * p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol))); CAPACITY(p) = n; SET_SIZE(p, 0); return p; } extern void report_b(FILE * out, const symbol * p) { for (int i = 0; i < SIZE(p); i++) { if (p[i] > 255) { printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } putc(p[i], out); } } extern void output_str(FILE * outfile, struct str * str) { report_s(outfile, str_data(str)); } extern void lose_b(symbol * p) { if (p == NULL) return; FREE((char *) p - HEAD); } extern symbol * increase_capacity_b(symbol * p, int n) { symbol * q = create_b(CAPACITY(p) + n + EXTENDER); memmove(q, p, CAPACITY(p) * sizeof(symbol)); SET_SIZE(q, SIZE(p)); lose_b(p); return q; } extern symbol * add_to_b(symbol * p, const symbol * q, int n) { int x = SIZE(p) + n - CAPACITY(p); if (x > 0) p = increase_capacity_b(p, x); memmove(p + SIZE(p), q, n * sizeof(symbol)); ADD_TO_SIZE(p, n); return p; } extern symbol * copy_b(const symbol * p) { int n = SIZE(p); symbol * q = create_b(n); add_to_b(q, p, n); return q; } int space_count = 0; static void * xmalloc(size_t n) { void * result = malloc(n); if (result == NULL) { fprintf(stderr, "Failed to allocate %lu bytes\n", (unsigned long)n); exit(1); } return result; } extern void * check_malloc(size_t n) { space_count++; return xmalloc(n); } extern void check_free(void * p) { if (p) space_count--; free(p); } extern int checked_snprintf(char *str, size_t size, const char *restrict format, ...) { va_list ap; va_start(ap, format); int r = vsnprintf(str, size, format, ap); va_end(ap); // Some pre-C99 snprintf implementations return -1 if the buffer is too // small so cast to unsigned for a simpler test (we require C99, but better // to be robust if we encounter a C99 compiler with pre-C99 quirks in its // runtime library). if ((unsigned)r >= size) { fprintf(stderr, "snprintf(buf, %zu, \"%s\", ...) 
would overflow\n", size, format); exit(1); } return r; } /* Convert a block to a zero terminated string. */ extern char * b_to_sz(const symbol * p) { int n = SIZE(p); char * s = (char *)xmalloc(n + 1); for (int i = 0; i < n; i++) { if (p[i] > 255) { printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } s[i] = (char)p[i]; } s[n] = 0; return s; } /* Add a single symbol to a block. If p = 0 the block is created. */ extern symbol * add_symbol_to_b(symbol * p, symbol ch) { if (p == NULL) p = create_b(1); int k = SIZE(p); int x = k + 1 - CAPACITY(p); if (x > 0) p = increase_capacity_b(p, x); p[k] = ch; ADD_TO_SIZE(p, 1); return p; } extern byte * create_s(int n) { byte * p = (byte *) (HEAD + (byte *) MALLOC(HEAD + (n + 1))); CAPACITY(p) = n; SET_SIZE(p, 0); return p; } extern byte * create_s_from_sz(const char * s) { int n = strlen(s); byte * p = create_s(n); memcpy(p, s, n + 1); SET_SIZE(p, n); return p; } extern byte * create_s_from_data(const char * s, int n) { byte * p = create_s(n); memcpy(p, s, n); p[n] = '\0'; SET_SIZE(p, n); return p; } extern void report_s(FILE * out, const byte * p) { fwrite(p, 1, SIZE(p), out); } extern void lose_s(byte * p) { if (p == NULL) return; FREE((byte *) p - HEAD); } extern byte * increase_capacity_s(byte * p, int n) { int new_size = CAPACITY(p) + n + EXTENDER; // Switch to exponential growth for large strings. if (new_size > 512) new_size *= 2; byte * q = create_s(new_size); memmove(q, p, CAPACITY(p)); SET_SIZE(q, SIZE(p)); lose_s(p); return q; } extern byte * ensure_capacity_s(byte * p, int n) { int x = SIZE(p) + n - CAPACITY(p); if (x > 0) p = increase_capacity_s(p, x); return p; } extern byte * copy_s(const byte * p) { return add_s_to_s(NULL, p); } /* Add a string with given length to a byte block. If p = 0 the block is created. 
*/ extern byte * add_slen_to_s(byte * p, const char * s, int n) { if (p == NULL) { p = create_s(n); } else { p = ensure_capacity_s(p, n); } int k = SIZE(p); memcpy(p + k, s, n); SET_SIZE(p, k + n); return p; } /* Add a byte block to a byte block. If p = 0 the block is created. */ extern byte * add_s_to_s(byte * p, const byte * s) { return add_slen_to_s(p, (const char *)s, SIZE(s)); } /* Add a zero terminated string to a byte block. If p = 0 the block is created. */ extern byte * add_sz_to_s(byte * p, const char * s) { return add_slen_to_s(p, s, strlen(s)); } /* Add a single character to a byte block. If p = 0 the block is created. */ extern byte * add_char_to_s(byte * p, char ch) { if (p == NULL) { p = create_s(1); } else { p = ensure_capacity_s(p, 1); } p[SIZE(p)] = ch; ADD_TO_SIZE(p, 1); return p; } /* The next section defines string handling capabilities in terms of the lower level byte block handling capabilities of space.c */ /* -------------------------------------------------------------*/ struct str { byte * data; }; /* Create a new string. */ extern struct str * str_new(void) { struct str * output = (struct str *) xmalloc(sizeof(struct str)); output->data = create_s(0); return output; } /* Delete a string. */ extern void str_delete(struct str * str) { lose_s(str->data); free(str); } /* Append a str to this str. */ extern void str_append(struct str * str, const struct str * add) { str->data = add_s_to_s(str->data, add->data); } /* Append a character to this str. */ extern void str_append_ch(struct str * str, char add) { str->data = add_char_to_s(str->data, add); } /* Append a low level byte block to a str. */ extern void str_append_s(struct str * str, const byte * q) { str->data = add_s_to_s(str->data, q); } /* Append a (char *, null terminated) string to a str. */ extern void str_append_string(struct str * str, const char * s) { str->data = add_sz_to_s(str->data, s); } /* Append an integer to a str. 
*/ extern void str_append_int(struct str * str, int i) { // Most calls are for integers 0 to 9 (~72%). if (i >= 0 && i <= 9) { str_append_ch(str, '0' + i); return; } // Ensure there's enough space then snprintf() directly onto the end. int max_size = (CHAR_BIT * sizeof(int) + 5) / 3; str->data = ensure_capacity_s(str->data, max_size); int r = checked_snprintf((char*)str->data + SIZE(str->data), max_size, "%d", i); ADD_TO_SIZE(str->data, r); } /* Append wide character to a string as UTF-8. */ extern void str_append_wchar_as_utf8(struct str * str, symbol ch) { if (ch < 0x80) { str_append_ch(str, ch); return; } if (ch < 0x800) { str_append_ch(str, (ch >> 6) | 0xC0); str_append_ch(str, (ch & 0x3F) | 0x80); return; } str_append_ch(str, (ch >> 12) | 0xE0); str_append_ch(str, ((ch >> 6) & 0x3F) | 0x80); str_append_ch(str, (ch & 0x3F) | 0x80); } /* Clear a string */ extern void str_clear(struct str * str) { SET_SIZE(str->data, 0); } /* Set a string */ extern void str_assign(struct str * str, const char * s) { str_clear(str); str_append_string(str, s); } /* Copy a string. */ extern struct str * str_copy(const struct str * old) { struct str * newstr = str_new(); str_append(newstr, old); return newstr; } /* Get the data stored in this str. */ extern byte * str_data(const struct str * str) { return str->data; } /* Get the length of the str. */ extern int str_len(const struct str * str) { return SIZE(str->data); } /* Get the last character of the str. * * Or -1 if the string is empty. */ extern int str_back(const struct str *str) { return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1; } /* Remove the last character of the str. * * Or do nothing if the string is empty. */ extern void str_pop(const struct str *str) { if (SIZE(str->data)) ADD_TO_SIZE(str->data, -1); } /* Remove the last n characters of the str. * * Or make the string empty if its length is less than n. 
*/ extern void str_pop_n(const struct str *str, int n) { if (SIZE(str->data) > n) { ADD_TO_SIZE(str->data, -n); } else { SET_SIZE(str->data, 0); } } extern int get_utf8(const symbol * p, int * slot) { int b0 = *p++; if (b0 < 0xC0) { /* 1100 0000 */ * slot = b0; return 1; } int b1 = *p++; if (b0 < 0xE0) { /* 1110 0000 */ * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; } * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3; } extern int put_utf8(int ch, symbol * p) { if (ch < 0x80) { p[0] = ch; return 1; } if (ch < 0x800) { p[0] = (ch >> 6) | 0xC0; p[1] = (ch & 0x3F) | 0x80; return 2; } p[0] = (ch >> 12) | 0xE0; p[1] = ((ch >> 6) & 0x3F) | 0x80; p[2] = (ch & 0x3F) | 0x80; return 3; } snowball-3.1.0/compiler/tokeniser.c000066400000000000000000000571341520373054300173310ustar00rootroot00000000000000 #include /* stderr etc */ #include /* malloc free */ #include /* strlen */ #include /* isalpha etc */ #include "header.h" #include "tokens.h" #define INITIAL_INPUT_BUFFER_SIZE 8192 static int hex_to_num(int ch); extern byte * get_input(const char * filename) { FILE * input = strcmp(filename, "-") == 0 ? stdin : fopen(filename, "rb"); if (input == NULL) { return NULL; } byte * u = NULL; int size = fseek(input, 0, SEEK_END) == 0 ? ftell(input) : -1; if (size >= 0 && fseek(input, 0, SEEK_SET) == 0) { u = create_s(size); if (fread(u, size, 1, input) != 1) { fprintf(stderr, "%s: Read error\n", filename); exit(1); } } else { // Unseekable stream, e.g. piped stdin. size = 0; u = create_s(INITIAL_INPUT_BUFFER_SIZE); while (true) { int s = CAPACITY(u) - size; int r = fread(u + size, 1, s, input); if (r < 0) { fprintf(stderr, "%s: Read error\n", filename); exit(1); } size += r; if (r < s) break; u = increase_capacity_s(u, size); } } if (input != stdin) fclose(input); SET_SIZE(u, size); return u; } static void error(struct tokeniser * t, const char * s1, byte * p, int n, const char * s2) { if (t->error_count == 20) { fprintf(stderr, "... 
etc\n"); exit(1); } fprintf(stderr, "%s:%d: ", t->file, t->line_number); if (s1) fprintf(stderr, "%s", s1); if (p) { int i; for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); } if (s2) fprintf(stderr, "%s", s2); fprintf(stderr, "\n"); t->error_count++; } static void error1(struct tokeniser * t, const char * s) { error(t, s, NULL, 0, NULL); } static void error2(struct tokeniser * t, const char * s) { error(t, "unexpected end of text after ", NULL, 0, s); } static int compare_words(int m, const byte * p, int n, const byte * q) { if (m != n) return m - n; return memcmp(p, q, n); } static int find_word(int n, const byte * p) { int i = -1; int j = NUM_ALPHA_TOKENS; do { int k = i + (j - i) / 2; const struct token * w = alpha_tokens + k; int diff = compare_words(n, p, w->s_size, (const byte*)w->s); if (diff == 0) return w->code; if (diff < 0) j = k; else i = k; } while (j - i != 1); return -1; } static int white_space(struct tokeniser * t, int ch) { switch (ch) { case '\n': t->line_number++; /* fall through */ case '\r': case '\t': case ' ': return true; } return false; } static symbol * find_in_m(struct tokeniser * t, int n, byte * p) { struct m_pair * q; for (q = t->m_pairs; q; q = q->next) { byte * name = q->name; if (n == SIZE(name) && memcmp(name, p, n) == 0) return q->value; } return NULL; } static int read_literal_string(struct tokeniser * t, int c) { byte * p = t->p; SET_SIZE(t->b, 0); while (true) { if (c >= SIZE(p) || p[c] == '\n') { error1(t, "string literal not terminated"); return c; } int ch = p[c]; c++; if (ch == t->m_start) { /* Inside insert characters. 
*/ int c0 = c; bool newlines = false; /* no newlines as yet */ bool all_whitespace = true; /* no printing chars as yet */ while (true) { if (c >= SIZE(p) || (p[c] == '\n' && !all_whitespace)) { error1(t, "string literal not terminated"); return c; } ch = p[c]; if (ch == '\n') { newlines = true; } c++; if (ch == t->m_end) break; if (!white_space(t, ch)) all_whitespace = false; } if (!newlines) { int n = c - c0 - 1; /* macro size */ int firstch = p[c0]; symbol * q = find_in_m(t, n, p + c0); if (q == NULL) { if (n == 1 && (firstch == '\'' || firstch == t->m_start)) t->b = add_symbol_to_b(t->b, p[c0]); else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') { int codepoint = 0; if (t->uplusmode == UPLUS_DEFINED) { /* See if found with xxxx upper-cased. */ byte * uc = create_s(n); int i; for (i = 0; i != n; ++i) { uc[i] = toupper(p[c0 + i]); } q = find_in_m(t, n, uc); lose_s(uc); if (q != NULL) { t->b = add_to_b(t->b, q, SIZE(q)); continue; } error1(t, "Some U+xxxx stringdefs seen but not this one"); } else { t->uplusmode = UPLUS_UNICODE; } for (int x = c0 + 2; x != c - 1; ++x) { int hex = hex_to_num(p[x]); if (hex < 0) { error1(t, "Bad hex digit following U+"); break; } codepoint = (codepoint << 4) | hex; } if (t->encoding == ENC_UTF8) { if (codepoint < 0 || codepoint > 0x01ffff) { error1(t, "character values exceed 0x01ffff"); } /* Ensure there's enough space for a max length * UTF-8 sequence. 
*/ int b_size = SIZE(t->b); if (CAPACITY(t->b) < b_size + 3) { t->b = increase_capacity_b(t->b, 3); } SET_SIZE(t->b, b_size + put_utf8(codepoint, t->b + b_size)); } else { if (t->encoding == ENC_SINGLEBYTE) { /* Only ISO-8859-1 is handled this way - for * other single-byte character sets you need * to stringdef all the U+xxxx codes you use * like - e.g.: * * stringdef U+0171 hex 'FB' */ if (codepoint < 0 || codepoint > 0xff) { error1(t, "character values exceed 256"); } } else { if (codepoint < 0 || codepoint > 0xffff) { error1(t, "character values exceed 64K"); } } t->b = add_symbol_to_b(t->b, (symbol)codepoint); } } else { error(t, "string macro '", p + c0, n, "' undeclared"); } } else { t->b = add_to_b(t->b, q, SIZE(q)); } } } else { if (ch == '\'') return c; if (ch < 0 || ch >= 0x80) { if (t->encoding != ENC_WIDECHARS) { /* We don't really want people using non-ASCII literal * strings, but historically it's worked for single-byte * and UTF-8 if the source encoding matches what the * generated stemmer works in and it seems unfair to just * suddenly make this a hard error. 
*/ fprintf(stderr, "%s:%d: warning: Non-ASCII literal strings aren't " "portable - use stringdef instead\n", t->file, t->line_number); } else { error1(t, "Non-ASCII literal strings aren't " "portable - use stringdef instead"); } } t->b = add_symbol_to_b(t->b, p[c - 1]); } } } static int next_token(struct tokeniser * t) { byte * p = t->p; int c = t->c; int code = -1; while (true) { if (c >= SIZE(p)) { t->c = c; return -1; } int ch = p[c]; if (white_space(t, ch)) { c++; continue; } if (isalpha(ch)) { int c0 = c; while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++; t->c = c; code = find_word(c - c0, p + c0); if (code >= 0 && !t->token_disabled[code]) { return code; } SET_SIZE(t->s, 0); t->s = add_slen_to_s(t->s, (const char*)p + c0, c - c0); return c_name; } if (isdigit(ch)) { int value = ch - '0'; while (++c < SIZE(p) && isdigit(p[c])) { value = 10 * value + (p[c] - '0'); } t->c = c; t->number = value; return c_number; } byte ch1 = p[c]; byte ch2 = (SIZE(p) - c > 1) ? p[c + 1] : 0; t->c = c + 2; switch (ch1) { case '\'': t->c = read_literal_string(t, c + 1); return c_literalstring; case '!': if (ch2 == '=') return c_ne; // != break; case '$': --t->c; return c_dollar; // $ case '(': --t->c; return c_bra; // ( case ')': --t->c; return c_ket; // ) case '*': if (ch2 == '=') return c_multiplyassign; // *= --t->c; return c_multiply; // * case '+': if (ch2 == '=') return c_plusassign; // += --t->c; return c_plus; // + case '-': if (ch2 == '=') return c_minusassign; // -= if (ch2 == '>') return c_sliceto; // -> --t->c; return c_minus; // - case '/': if (ch2 == '/') return c_comment1; // // if (ch2 == '*') return c_comment2; // /* if (ch2 == '=') return c_divideassign; // /= --t->c; return c_divide; // / case '<': if (ch2 == '-') return c_slicefrom; // <- if (ch2 == '=') return c_le; // <= if (ch2 == '+') { // <+ fprintf(stderr, "%s:%d: warning: `<+` is a legacy feature - " "use `insert` instead\n", t->file, t->line_number); return c_insert; } --t->c; return c_lt; // < 
case '=': if (ch2 == '=') return c_eq; // == if (ch2 == '>') return c_assignto; // => --t->c; return c_assign; // = case '>': if (ch2 == '=') return c_ge; // >= --t->c; return c_gt; // > case '?': --t->c; return c_debug; // ? case '[': --t->c; return c_leftslice; // [ case ']': --t->c; return c_rightslice; // ] } error(t, "'", p + c, 1, "' unknown"); c++; continue; } } static int next_char(struct tokeniser * t) { if (t->c >= SIZE(t->p)) return -1; return t->p[t->c++]; } static int next_real_char(struct tokeniser * t) { while (true) { int ch = next_char(t); if (!white_space(t, ch)) return ch; } } static void read_chars(struct tokeniser * t) { int ch = next_real_char(t); if (ch < 0) { error2(t, "stringdef"); return; } int c0 = t->c-1; while (true) { ch = next_char(t); if (white_space(t, ch) || ch < 0) break; } SET_SIZE(t->s, 0); t->s = add_slen_to_s(t->s, (const char*)t->p + c0, t->c - c0 - 1); } static int decimal_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; return -1; } static int hex_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; if ('A' <= ch && ch <= 'F') return ch - 'A' + 10; return -1; } static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { int c = 0; int d = 0; while (true) { while (c < SIZE(p) && p[c] == ' ') c++; if (c == SIZE(p)) break; int number = 0; while (c != SIZE(p)) { int ch = p[c]; if (ch == ' ') break; if (base == 10) { ch = decimal_to_num(ch); if (ch < 0) { error1(t, "decimal string contains non-digits"); return; } } else { ch = hex_to_num(ch); if (ch < 0) { error1(t, "hex string contains non-hex characters"); return; } } number = base * number + ch; c++; } if (t->encoding == ENC_SINGLEBYTE) { if (number < 0 || number > 0xff) { error1(t, "character values exceed 256"); return; } } else { if (number < 0 || number > 0xffff) { error1(t, "character values exceed 64K"); return; } } if (t->encoding == ENC_UTF8) d += put_utf8(number, p + d); else 
p[d++] = number; } SET_SIZE(p, d); } extern int read_token(struct tokeniser * t) { if (t->token_held) { t->token_held = false; return t->token; } t->token_reported_as_unexpected = false; byte * p = t->p; while (true) { int code = next_token(t); switch (code) { case c_comment1: /* slash-slash comment */ while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; continue; case c_comment2: { /* slash-star comment */ // Scan for a '*' stopping one before the end since we need a // '/' to follow it to close the comment. int size_less_one = SIZE(p) - 1; int c = t->c; while (true) { if (c >= size_less_one) { error1(t, "/* comment not terminated"); t->token = -1; return -1; } if (p[c] == '\n') { t->line_number++; } else if (p[c] == '*' && p[c + 1] == '/') { // Found '*/' to end of comment. t->c = c + 2; break; } ++c; } continue; } case c_stringescapes: { int ch1 = next_real_char(t); int ch2 = next_real_char(t); if (ch2 < 0) { error2(t, "stringescapes"); continue; } if (ch1 == '\'') { error1(t, "first stringescape cannot be '"); continue; } t->m_start = ch1; t->m_end = ch2; continue; } case c_stringdef: { int base = 0; read_chars(t); code = read_token(t); if (code == c_hex) { // We use `hex` to define U+xxxx stringdefs for single-byte // character sets so suppress the warning there, e.g.: // // stringdef U+02D9 hex 'FF' if (!(t->s[0] == 'U' && t->s[1] == '+')) { fprintf(stderr, "%s:%d: warning: `hex` is a legacy feature - " "use {U+1234} notation instead\n", t->file, t->line_number); } base = 16; code = read_token(t); } else if (code == c_decimal) { fprintf(stderr, "%s:%d: warning: `decimal` is a legacy feature - " "use {U+1234} notation instead\n", t->file, t->line_number); base = 10; code = read_token(t); } if (code != c_literalstring) { error1(t, "string omitted after stringdef"); continue; } if (base > 0) convert_numeric_string(t, t->b, base); NEW(m_pair, q); q->next = t->m_pairs; q->name = copy_s(t->s); q->value = copy_b(t->b); t->m_pairs = q; if (t->uplusmode != UPLUS_DEFINED 
&& (SIZE(t->s) >= 3 && t->s[0] == 'U' && t->s[1] == '+')) { if (t->uplusmode == UPLUS_UNICODE) { error1(t, "U+xxxx already used with implicit meaning"); } else { t->uplusmode = UPLUS_DEFINED; } } continue; } case c_get: { code = read_token(t); if (code != c_literalstring) { error1(t, "string omitted after get"); continue; } t->get_depth++; if (t->get_depth > 10) { error1(t, "get directives go 10 deep. Looping?"); exit(1); } NEW(input, q); char * file = b_to_sz(t->b); int file_owned = 1; byte * u = get_input(file); if (u == NULL) { struct include * r; for (r = t->includes; r; r = r->next) { byte * s = copy_s(r->s); s = add_sz_to_s(s, file); s[SIZE(s)] = 0; if (file_owned > 0) { free(file); } else { lose_s((byte *)file); } file = (char*)s; file_owned = -1; u = get_input(file); if (u != NULL) break; } } if (u == NULL) { error(t, "Can't get '", (byte *)file, strlen(file), "'"); exit(1); } memmove(q, t, sizeof(struct input)); t->next = q; t->p = u; t->c = 0; t->file = file; t->file_owned = file_owned; t->line_number = 1; p = t->p; continue; } case -1: if (t->next) { lose_s(p); struct input * q = t->next; memmove(t, q, sizeof(struct input)); p = t->p; FREE(q); t->get_depth--; continue; } /* fall through */ default: t->previous_token = t->token; t->token = code; return code; } } } extern int peek_token(struct tokeniser * t) { int token = read_token(t); t->token_held = true; return token; } extern const char * name_of_token(int code) { for (int i = 0; i < NUM_ALPHA_TOKENS; i++) { if (alpha_tokens[i].code == code) return alpha_tokens[i].s; } switch (code) { case c_dollar: return "$"; case c_bra: return "("; case c_ket: return ")"; case c_multiply: return "*"; case c_plus: return "+"; case c_minus: return "-"; case c_divide: return "/"; case c_lt: return "<"; case c_stringassign: case c_assign: return "="; case c_gt: return ">"; case c_debug: return "?"; case c_leftslice: return "["; case c_rightslice: return "]"; case c_ne: return "!="; case c_multiplyassign: return "*="; 
case c_plusassign: return "+="; case c_minusassign: return "-="; case c_sliceto: return "->"; case c_comment2: return "/*"; case c_comment1: return "//"; case c_divideassign: return "/="; case c_insert: return "<+"; case c_slicefrom: return "<-"; case c_le: return "<="; case c_eq: return "=="; case c_assignto: return "=>"; case c_ge: return ">="; case c_name: return "name"; case c_number: return "number"; case c_literalstring: return "literal"; case c_neg: return "neg"; case c_grouping: return "grouping"; case c_call: return "call"; case c_booltest: return "Boolean test"; case c_functionend: return "Function end"; case c_goto_grouping: return "goto grouping"; case c_gopast_grouping: return "gopast grouping"; case c_goto_non: return "goto non"; case c_gopast_non: return "gopast non"; case c_not_booltest: return "Inverted boolean test"; case -2: return "start of text"; case -1: return "end of text"; default: return "?"; } } extern void disable_token(struct tokeniser * t, int code) { t->token_disabled[code] = 1; } extern struct tokeniser * create_tokeniser(byte * p, char * file) { NEW(tokeniser, t); *t = (struct tokeniser){0}; t->p = p; t->file = file; t->line_number = 1; t->b = create_b(0); t->s = create_s(0); t->m_start = -1; t->token = -2; t->previous_token = -2; t->uplusmode = UPLUS_NONE; return t; } extern void close_tokeniser(struct tokeniser * t) { lose_b(t->b); lose_s(t->s); { struct m_pair * q = t->m_pairs; while (q) { struct m_pair * q_next = q->next; lose_s(q->name); lose_b(q->value); FREE(q); q = q_next; } } { struct input * q = t->next; while (q) { struct input * q_next = q->next; FREE(q); q = q_next; } } if (t->file_owned > 0) { free(t->file); } else if (t->file_owned < 0) { lose_s((byte *)t->file); } FREE(t); } snowball-3.1.0/compiler/tokens.h000066400000000000000000000047751520373054300166410ustar00rootroot00000000000000struct token { byte code; /* Token code (from enum token_code) */ byte s_size; /* Size of token */ const char s[14]; /* Token */ }; /* 
List of alphabetical tokens and their corresponding codes (symbol tokens are * instead handled using `switch` in tokenise.c). * * Tokens below are ordered primarily by length, then by ASCII collating * order amongst tokens of the same length. */ static const struct token alpha_tokens[] = { { c_as, 2, "as" }, { c_do, 2, "do" }, { c_or, 2, "or" }, { c_and, 3, "and" }, { c_for, 3, "for" }, { c_get, 3, "get" }, { c_hex, 3, "hex" }, { c_hop, 3, "hop" }, { c_len, 3, "len" }, { c_non, 3, "non" }, { c_not, 3, "not" }, { c_set, 3, "set" }, { c_try, 3, "try" }, { c_fail, 4, "fail" }, { c_goto, 4, "goto" }, { c_loop, 4, "loop" }, { c_next, 4, "next" }, { c_size, 4, "size" }, { c_test, 4, "test" }, { c_true, 4, "true" }, { c_among, 5, "among" }, { c_false, 5, "false" }, { c_lenof, 5, "lenof" }, { c_limit, 5, "limit" }, { c_unset, 5, "unset" }, { c_atmark, 6, "atmark" }, { c_attach, 6, "attach" }, { c_cursor, 6, "cursor" }, { c_define, 6, "define" }, { c_delete, 6, "delete" }, { c_gopast, 6, "gopast" }, { c_insert, 6, "insert" }, { c_maxint, 6, "maxint" }, { c_minint, 6, "minint" }, { c_repeat, 6, "repeat" }, { c_sizeof, 6, "sizeof" }, { c_tomark, 6, "tomark" }, { c_atleast, 7, "atleast" }, { c_atlimit, 7, "atlimit" }, { c_decimal, 7, "decimal" }, { c_reverse, 7, "reverse" }, { c_setmark, 7, "setmark" }, { c_strings, 7, "strings" }, { c_tolimit, 7, "tolimit" }, { c_booleans, 8, "booleans" }, { c_integers, 8, "integers" }, { c_routines, 8, "routines" }, { c_setlimit, 8, "setlimit" }, { c_backwards, 9, "backwards" }, { c_externals, 9, "externals" }, { c_groupings, 9, "groupings" }, { c_stringdef, 9, "stringdef" }, { c_substring, 9, "substring" }, { c_backwardmode, 12, "backwardmode" }, { c_stringescapes, 13, "stringescapes" } }; #define NUM_ALPHA_TOKENS ((int)(sizeof(alpha_tokens) / sizeof(alpha_tokens[0]))) 
snowball-3.1.0/csharp/000077500000000000000000000000001520373054300146165ustar00rootroot00000000000000snowball-3.1.0/csharp/.gitignore000066400000000000000000000001061520373054300166030ustar00rootroot00000000000000*.o *.suo *.user *.GhostDoc.xml bin/ obj/ TestResults/ TestResult.xml snowball-3.1.0/csharp/Snowball/000077500000000000000000000000001520373054300163775ustar00rootroot00000000000000snowball-3.1.0/csharp/Snowball/Algorithms/000077500000000000000000000000001520373054300205105ustar00rootroot00000000000000snowball-3.1.0/csharp/Snowball/Algorithms/.gitignore000066400000000000000000000000171520373054300224760ustar00rootroot00000000000000*.generated.cs snowball-3.1.0/csharp/Snowball/Among.cs000066400000000000000000000067651520373054300200050ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // Copyright (c) 2025, Olly Betts // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. Neither the name of the Snowball project nor the names of its contributors // may be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace Snowball { using System; using System.Text; /// /// Snowball's among construction. /// /// public sealed class Among { /// /// Search string. /// /// public string SearchString { get; private set; } /// /// Index to longest matching substring. /// /// public int MatchIndex { get; private set; } /// /// Result of the lookup. /// /// public int Result { get; private set; } /// /// Condition function index (or 0 for no condition). /// /// public int Condition { get; private set; } /// /// Initializes a new instance of the class. /// /// /// The search string. /// The index to the longest matching substring. /// The result of the lookup. /// The index of the condition function (0 if none). /// public Among(String str, int index, int result, int condition) { this.SearchString = str; this.MatchIndex = index; this.Result = result; this.Condition = condition; } /// /// Returns a that represents this instance. /// /// /// /// A that represents this instance. /// /// public override string ToString() { return SearchString; } } } snowball-3.1.0/csharp/Snowball/AssemblyInfo.cs000066400000000000000000000026271520373054300213300ustar00rootroot00000000000000using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; // General Information about an assembly is controlled through the following // set of attributes. Change these attribute values to modify the information // associated with an assembly. 
[assembly: AssemblyTitle("Snowball")] [assembly: AssemblyDescription("")] [assembly: AssemblyConfiguration("")] [assembly: AssemblyCompany("")] [assembly: AssemblyProduct("Snowball")] [assembly: AssemblyCopyright("Copyright © 2015-2025")] [assembly: AssemblyTrademark("")] [assembly: AssemblyCulture("")] // Setting ComVisible to false makes the types in this assembly not visible // to COM components. If you need to access a type in this assembly from // COM, set the ComVisible attribute to true on that type. [assembly: ComVisible(false)] // The following GUID is for the ID of the typelib if this project is exposed to COM [assembly: Guid("5c54ebc8-a3a3-46f8-b732-60b1440c8b0b")] // Version information for an assembly consists of the following four values: // // Major Version // Minor Version // Build Number // Revision // // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] [assembly: AssemblyVersion(/*SNOWBALL_VERSION*/"3.1.0.0")] [assembly: AssemblyFileVersion(/*SNOWBALL_VERSION*/"3.1.0.0")] snowball-3.1.0/csharp/Snowball/Stemmer.cs000066400000000000000000000436701520373054300203540ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // Copyright (c) 2018-2026, Olly Betts // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. 
Neither the name of the Snowball project nor the names of its contributors // may be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace Snowball { using System; using System.Diagnostics; using System.Linq; using System.Text; /// /// Class holding current state. /// /// public class Env { /// /// Initializes a new instance of the class. /// /// protected Env() { } /// /// The current string. /// /// protected StringBuilder current; /// /// Current cursor position. /// /// protected int cursor; /// /// Forward limit for inspecting the buffer. /// /// protected int limit; /// /// Backward limit for inspecting the buffer. /// /// protected int limit_backward; /// /// Start of the slice. /// /// protected int bra; /// /// End of the slice. /// /// protected int ket; /// /// Copy another Env object. /// /// public Env(Env other) { copy_from(other); } /// /// Copy another Env object. /// /// protected void copy_from(Env other) { current = other.current; cursor = other.cursor; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } } /// /// Base class for Snowball's stemmer algorithms. 
///
    ///
    public abstract class Stemmer : Env
    {
        /// <summary>
        ///   Among function being called.
        /// </summary>
        ///
        protected int af;

        /// <summary>
        ///   Initializes a new instance of the class with an empty buffer.
        /// </summary>
        ///
        protected Stemmer()
        {
            current = new StringBuilder();
            setBufferContents("");
        }

        /// <summary>
        ///   Calls the stemmer to process the next word.
        /// </summary>
        ///
        protected abstract bool stem();

        /// <summary>
        ///   Stems the buffer's contents.
        /// </summary>
        /// <returns>The value returned by the stemming algorithm.</returns>
        ///
        public bool Stem()
        {
            bool outcome = this.stem();
            return outcome;
        }

        /// <summary>
        ///   Stems a given word.
        /// </summary>
        ///
        /// <param name="word">The word to be stemmed.</param>
        ///
        /// <returns>The stemmed word.</returns>
        ///
        public string Stem(string word)
        {
            setBufferContents(word);
            this.stem();
            string stemmed = current.ToString();
            return stemmed;
        }

        /// <summary>
        ///   Gets the current processing buffer.
        /// </summary>
        ///
        public StringBuilder Buffer
        {
            get { return current; }
        }

        /// <summary>
        ///   Gets or sets the current word to be stemmed
        ///   or the stemmed word, if the stemmer has been
        ///   processed.
        /// </summary>
        ///
        public string Current
        {
            get { return current.ToString(); }
            set { setBufferContents(value); }
        }

        // Loads value into the buffer and resets the cursor, both limits and
        // the slice markers so that they span the whole new contents.
        private void setBufferContents(string value)
        {
            current.Clear();
            current.Append(value);

            int length = current.Length;
            cursor = 0;
            limit_backward = 0;
            bra = 0;
            limit = length;
            ket = length;
        }

        /// <summary>
        ///   Determines whether the current character is
        ///   inside a given group of characters s.
        /// </summary>
        /// <returns>
        ///   -1 if the forward limit was reached, 1 if the character at the
        ///   cursor is not in the group, or 0 after moving past a character
        ///   of the group (only possible when repeat is false).
        /// </returns>
        protected int in_grouping(string s, int min, int max, bool repeat)
        {
            while (true)
            {
                if (cursor >= limit) return -1;

                char ch = current[cursor];
                bool member = ch >= min && ch <= max && s.IndexOf(ch) >= 0;
                if (!member) return 1;

                cursor++;
                if (!repeat) return 0;
            }
        }

        /// <summary>
        ///   Determines whether the current character is
        ///   inside a given group of characters s.
        /// </summary>
        /// <returns>
        ///   -1 if the backward limit was reached, 1 if the character before
        ///   the cursor is not in the group, or 0 after moving back over a
        ///   character of the group (only possible when repeat is false).
        /// </returns>
        protected int in_grouping_b(string s, int min, int max, bool repeat)
        {
            while (true)
            {
                if (cursor <= limit_backward) return -1;

                char ch = current[cursor - 1];
                bool member = ch >= min && ch <= max && s.IndexOf(ch) >= 0;
                if (!member) return 1;

                cursor--;
                if (!repeat) return 0;
            }
        }

        ///
        ///   Determines whether the current character is
        ///   outside a given group of characters s.
///
        protected int out_grouping(string s, int min, int max, bool repeat)
        {
            // Returns -1 at the forward limit, 1 when the character at the
            // cursor belongs to the group, and 0 after skipping one character
            // that does not (only possible when repeat is false).
            while (true)
            {
                if (cursor >= limit) return -1;

                char ch = current[cursor];
                if (ch >= min && ch <= max && s.IndexOf(ch) >= 0) return 1;

                cursor++;
                if (!repeat) return 0;
            }
        }

        /// <summary>
        ///   Determines whether the current character is
        ///   outside a given group of characters s.
        /// </summary>
        protected int out_grouping_b(string s, int min, int max, bool repeat)
        {
            // Backward counterpart of out_grouping: inspects the character
            // just before the cursor and moves the cursor towards
            // limit_backward.
            while (true)
            {
                if (cursor <= limit_backward) return -1;

                char ch = current[cursor - 1];
                if (ch >= min && ch <= max && s.IndexOf(ch) >= 0) return 1;

                cursor--;
                if (!repeat) return 0;
            }
        }

        /// <summary>
        ///   Determines if the current buffer contains the
        ///   string s, starting from the current position and
        ///   going forward.
        /// </summary>
        /// <returns>
        ///   true (with the cursor moved past the match) if s is found at
        ///   the cursor; otherwise false with the cursor unchanged.
        /// </returns>
        protected bool eq_s(String s)
        {
            int n = s.Length;
            if (limit - cursor < n) return false;

            int start = cursor;
            for (int k = 0; k < n; k++)
            {
                if (s[k] != current[start + k]) return false;
            }

            cursor = start + n;
            return true;
        }

        /// <summary>
        ///   Determines if the current buffer contains the
        ///   string s, starting from the current position and
        ///   going forward.
        /// </summary>
        /// <returns>
        ///   true (with the cursor moved past the match) if s is found at
        ///   the cursor; otherwise false with the cursor unchanged.
        /// </returns>
        protected bool eq_s(StringBuilder s)
        {
            int n = s.Length;
            if (limit - cursor < n) return false;

            int start = cursor;
            for (int k = 0; k < n; k++)
            {
                if (s[k] != current[start + k]) return false;
            }

            cursor = start + n;
            return true;
        }

        /// <summary>
        ///   Determines if the current buffer contains the
        ///   string s, starting from the current position and
        ///   going backwards.
        /// </summary>
        /// <returns>
        ///   true (with the cursor moved to the start of the match) if s
        ///   ends at the cursor; otherwise false with the cursor unchanged.
        /// </returns>
        protected bool eq_s_b(String s)
        {
            int n = s.Length;
            if (cursor - limit_backward < n) return false;

            int start = cursor - n;
            for (int k = 0; k < n; k++)
            {
                if (s[k] != current[start + k]) return false;
            }

            cursor = start;
            return true;
        }

        ///
        ///   Determines if the current buffer contains the
        ///   string s, starting from the current position and
        ///   going backwards.
///
        protected bool eq_s_b(StringBuilder s)
        {
            // Not enough characters between the backward limit and the cursor.
            if (cursor - limit_backward < s.Length)
                return false;

            for (int i = 0; i != s.Length; i++)
            {
                if (current[cursor - s.Length + i] != s[i])
                    return false;
            }

            cursor -= s.Length;
            return true;
        }

        /// <summary>
        ///   Searches if the current buffer matches against one of the
        ///   amongs, starting from the current cursor position and going
        ///   forward.
        /// </summary>
        ///
        /// <param name="v">
        ///   The among table to search.  The binary search below relies on
        ///   the entries being sorted by search string.
        /// </param>
        /// <param name="call_among_func">
        ///   Callback invoked (after <c>af</c> has been set to the entry's
        ///   condition index) for entries whose condition is non-zero; the
        ///   entry only counts as a match if it returns true.
        /// </param>
        /// <returns>
        ///   The result of the matching entry, or 0 if nothing matched.  On
        ///   a match the cursor is left just after the matched string.
        /// </returns>
        ///
        protected int find_among(Among[] v, Func<bool> call_among_func)
        {
            int i = 0;
            int j = v.Length;

            int c = cursor;
            int l = limit;

            // Number of leading characters known to agree with the entries
            // at i and j respectively.
            int common_i = 0;
            int common_j = 0;

            bool first_key_inspected = false;

            while (true)
            {
                int k = i + ((j - i) >> 1);
                int diff = 0;
                int common = common_i < common_j ? common_i : common_j; // smaller
                Among w = v[k];

                for (int i2 = common; i2 < w.SearchString.Length; i2++)
                {
                    if (c + common == l)
                    {
                        // Ran out of input before the end of the key.
                        diff = -1;
                        break;
                    }

                    diff = current[c + common] - w.SearchString[i2];
                    if (diff != 0) break;
                    common++;
                }

                if (diff < 0)
                {
                    j = k;
                    common_j = common;
                }
                else
                {
                    i = k;
                    common_i = common;
                }

                if (j - i <= 1)
                {
                    if (i > 0) break; // v->s has been inspected
                    if (j == i) break; // only one item in v

                    // - but now we need to go round once more to get
                    // v->s inspected. This looks messy, but is actually
                    // the optimal approach.

                    if (first_key_inspected) break;
                    first_key_inspected = true;
                }
            }

            // Walk from the candidate along the MatchIndex chain until an
            // entry is fully matched and its condition (if any) holds.
            while (true)
            {
                Among w = v[i];
                if (common_i >= w.SearchString.Length)
                {
                    cursor = c + w.SearchString.Length;
                    if (w.Condition == 0) return w.Result;

                    af = w.Condition;
                    if (call_among_func())
                    {
                        // Set the cursor again after the callback has run.
                        cursor = c + w.SearchString.Length;
                        return w.Result;
                    }
                }

                i = w.MatchIndex;
                if (i < 0) return 0;
            }
        }

        ///
        ///   Searches if the current buffer matches against one of the
        ///   amongs, starting from the current cursor position and going
        ///   backwards.
///
        ///
        /// <param name="v">
        ///   The among table to search.  The binary search below relies on
        ///   the entries being sorted for comparison from the last character
        ///   backwards.
        /// </param>
        /// <param name="call_among_func">
        ///   Callback invoked (after <c>af</c> has been set to the entry's
        ///   condition index) for entries whose condition is non-zero; the
        ///   entry only counts as a match if it returns true.
        /// </param>
        /// <returns>
        ///   The result of the matching entry, or 0 if nothing matched.  On
        ///   a match the cursor is left at the start of the matched string.
        /// </returns>
        protected int find_among_b(Among[] v, Func<bool> call_among_func)
        {
            int i = 0;
            int j = v.Length;

            int c = cursor;
            int lb = limit_backward;

            // Number of trailing characters known to agree with the entries
            // at i and j respectively.
            int common_i = 0;
            int common_j = 0;

            bool first_key_inspected = false;

            while (true)
            {
                int k = i + ((j - i) >> 1);
                int diff = 0;
                int common = common_i < common_j ? common_i : common_j;
                Among w = v[k];

                for (int i2 = w.SearchString.Length - 1 - common; i2 >= 0; i2--)
                {
                    if (c - common == lb)
                    {
                        // Ran out of input before the start of the key.
                        diff = -1;
                        break;
                    }

                    diff = current[c - 1 - common] - w.SearchString[i2];
                    if (diff != 0) break;
                    common++;
                }

                if (diff < 0)
                {
                    j = k;
                    common_j = common;
                }
                else
                {
                    i = k;
                    common_i = common;
                }

                if (j - i <= 1)
                {
                    if (i > 0) break;
                    if (j == i) break;
                    // Go round once more so that the first entry gets
                    // inspected too.
                    if (first_key_inspected) break;
                    first_key_inspected = true;
                }
            }

            // Walk from the candidate along the MatchIndex chain until an
            // entry is fully matched and its condition (if any) holds.
            while (true)
            {
                Among w = v[i];
                if (common_i >= w.SearchString.Length)
                {
                    cursor = c - w.SearchString.Length;
                    if (w.Condition == 0) return w.Result;

                    af = w.Condition;
                    if (call_among_func())
                    {
                        // Set the cursor again after the callback has run.
                        cursor = c - w.SearchString.Length;
                        return w.Result;
                    }
                }

                i = w.MatchIndex;
                if (i < 0) return 0;
            }
        }

        /// <summary>
        ///   Replaces the characters between c_bra
        ///   and c_ket by the characters in s.
        /// </summary>
        /// <returns>
        ///   The change in buffer length (length of s minus the length of
        ///   the replaced region).
        /// </returns>
        ///
        protected int replace_s(int c_bra, int c_ket, String s)
        {
            int adjustment = s.Length - (c_ket - c_bra);
            Replace(current, c_bra, c_ket, s);
            limit += adjustment;
            // A cursor at or after the replaced region moves with the text
            // that follows it; a cursor inside the region snaps to its start.
            if (cursor >= c_ket)
                cursor += adjustment;
            else if (cursor > c_bra)
                cursor = c_bra;
            return adjustment;
        }

        /// <summary>
        ///   Checks if a slicing can be done.
        /// </summary>
        protected void slice_check()
        {
            Debug.Assert(bra >= 0);
            Debug.Assert(bra <= ket);
            Debug.Assert(ket <= limit);
            Debug.Assert(limit <= current.Length);
        }

        /// <summary>
        ///   Replaces the contents of the slice with the string s.
        /// </summary>
        ///
        /// <param name="s">The string.</param>
        protected void slice_from(String s)
        {
            slice_check();
            replace_s(bra, ket, s);
            // The slice now covers exactly the inserted text.
            ket = bra + s.Length;
        }

        /// <summary>
        ///   Removes the current slice contents.
        /// </summary>
        ///
        protected void slice_del()
        {
            slice_from("");
        }

        ///
        ///   Replaces the contents of the bracket with the string s.
/// /// protected void insert(int c_bra, int c_ket, String s) { int adjustment = replace_s(c_bra, c_ket, s); if (c_bra <= bra) bra += adjustment; if (c_bra <= ket) ket += adjustment; } /// /// Replaces the contents of s with the slice. /// /// protected void slice_to(StringBuilder s) { slice_check(); Replace(s, 0, s.Length, current.ToString(bra, ket - bra)); } /// /// Replaces the contents of the bracket with the string s. /// /// protected void assign_to(StringBuilder s) { Replace(s, 0, s.Length, current.ToString(0, limit)); } /// /// Replaces a specific region of the buffer with another text. /// public static void Replace(StringBuilder sb, int index, int length, string text) { sb.Remove(index, length - index); sb.Insert(index, text); } } } snowball-3.1.0/csharp/Stemwords/000077500000000000000000000000001520373054300166055ustar00rootroot00000000000000snowball-3.1.0/csharp/Stemwords/App.config000066400000000000000000000002661520373054300205200ustar00rootroot00000000000000 snowball-3.1.0/csharp/Stemwords/Program.cs000066400000000000000000000105211520373054300205420ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // Copyright (c) 2025, Olly Betts // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. Neither the name of the Snowball project nor the names of its contributors // may be used to endorse or promote products derived from this software // without specific prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace Snowball { using System; using System.IO; using System.Reflection; using System.Linq; /// /// Snowball's Stemmer program. /// /// public static class Program { private static void usage() { Console.WriteLine("Usage: stemwords.exe -l [-i ] [-o ]"); } /// /// Main program entrypoint. 
///
        ///
        /// <param name="args">
        ///   Command line arguments: <c>-l</c> (language, required),
        ///   <c>-i</c> (input file, default standard input) and <c>-o</c>
        ///   (output file, default standard output), each followed by its
        ///   value.
        /// </param>
        public static void Main(String[] args)
        {
            string language = null;
            string inputName = null;
            string outputName = null;

            for (int i = 0; i < args.Length; i++)
            {
                string option = args[i];
                if (option != "-l" && option != "-i" && option != "-o")
                    continue;

                // An option given as the very last argument has no value;
                // report that instead of indexing past the end of args.
                if (i + 1 >= args.Length)
                {
                    usage();
                    return;
                }

                if (option == "-l")
                    language = args[i + 1];
                else if (option == "-i")
                    inputName = args[i + 1];
                else
                    outputName = args[i + 1];
            }

            if (language == null)
            {
                usage();
                return;
            }

            // Instantiate the first concrete Stemmer subclass in the library
            // assembly whose type name matches the requested language.
            Stemmer stemmer =
                typeof(Stemmer).Assembly.GetTypes()
                    .Where(t => t.IsSubclassOf(typeof(Stemmer)) && !t.IsAbstract)
                    .Where(t => match(t.Name, language))
                    .Select(t => (Stemmer)Activator.CreateInstance(t)).FirstOrDefault();

            if (stemmer == null)
            {
                Console.WriteLine("Language not found.");
                return;
            }

            TextReader input = System.Console.In;
            TextWriter output = null;

            try
            {
                if (inputName != null)
                    input = new StreamReader(inputName);

                if (outputName != null)
                {
                    output = new StreamWriter(outputName);
                }
                else
                {
                    // For some reason this is much faster than using
                    // System.Console.Out, at least with mono on Linux.
                    //
                    // `make check_sharp_tamil` takes 0.842s wallclock instead of
                    // 1.848s.
                    output = new StreamWriter(Console.OpenStandardOutput());
                }

                // Stem one word per input line.
                while (true)
                {
                    var line = input.ReadLine();
                    if (line == null) break;
                    var o = stemmer.Stem(line);
                    output.WriteLine(o);
                }

                output.Flush();
            }
            finally
            {
                // Release the writer and any reader we opened ourselves, so
                // file handles are closed on error paths too.
                // Console.In is not ours to dispose.
                if (output != null)
                    output.Dispose();
                if (inputName != null && input != System.Console.In)
                    input.Dispose();
            }
        }

        // Returns true if stemmerName begins with language followed by
        // "Stemmer", ignoring case.  The comparison is ordinal so the result
        // does not depend on the user's culture: with a culture-sensitive
        // comparison "irish" does not match "IrishStemmer" under Turkish
        // casing rules.
        private static bool match(string stemmerName, string language)
        {
            string expectedName = language + "Stemmer";

            return stemmerName.StartsWith(expectedName,
                StringComparison.OrdinalIgnoreCase);
        }
    }
}
snowball-3.1.0/dart/000077500000000000000000000000001520373054300142705ustar00rootroot00000000000000snowball-3.1.0/dart/.gitignore000066400000000000000000000000451520373054300162570ustar00rootroot00000000000000.dart_tool/ pubspec.lock .dart_deps snowball-3.1.0/dart/analysis_options.yaml000066400000000000000000000000501520373054300205450ustar00rootroot00000000000000include: package:lints/recommended.yaml snowball-3.1.0/dart/example/000077500000000000000000000000001520373054300157235ustar00rootroot00000000000000snowball-3.1.0/dart/example/test_app.dart000066400000000000000000000027311520373054300204210ustar00rootroot00000000000000import 'dart:convert'; import 'dart:io'; import 'package:snowball/snowball.dart'; void usage() { print("Usage: test_app [] [-o ]"); } Future main(List args) async { if (args.isEmpty) { usage(); exit(1); } final algorithms = Algorithm.values.where((v) => v.name == args[0]); if (algorithms.isEmpty) { print('Stemmer for ${args[0]} not found'); exit(1); } final algorithm = algorithms.first; final stemmer = SnowballStemmer(algorithm); int arg = 1; Stream> inStream; if (args.length > arg && args[arg] != '-o') { inStream = File(args[arg++]).openRead(); } else { inStream = stdin; } IOSink outStream; if (args.length > arg) { if (args.length != arg + 2 || args[arg] != '-o') { usage(); exit(1); } outStream = File(args[arg + 1]).openWrite(); } else { outStream = stdout.nonBlocking; } Stream reader = inStream .transform(utf8.decoder) .transform(LineSplitter()); final outBuffer = StringBuffer(); await for (var line in reader) { try { final stem = 
stemmer.stem(line); outBuffer.writeln(stem); if (outBuffer.length > 8192) { outStream.write(outBuffer); outBuffer.clear(); } } catch (e) { print('Failed to stem word "$line"'); rethrow; } } if (outBuffer.isNotEmpty) outStream.write(outBuffer); await outStream.flush(); await outStream.close(); } snowball-3.1.0/dart/generate_algorithms.pl000077500000000000000000000011731520373054300206550ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; my @algorithms = @ARGV; # Generate the lib/src/algorithms.dart file from modules.txt print("// ignore_for_file: constant_identifier_names\n\n"); print("import 'package:snowball/src/snowball.dart';\n\n"); foreach my $algorithm (@algorithms) { print("import '../ext/${algorithm}_stemmer.dart';\n"); } print("\nenum Algorithm {\n"); foreach my $algorithm (@algorithms) { print(" $algorithm,\n"); } print("}\n\nfinal stemmers = {\n"); foreach my $algorithm (@algorithms) { print(" Algorithm.$algorithm: ${algorithm}_stemmer(),\n"); } print("};\n"); snowball-3.1.0/dart/lib/000077500000000000000000000000001520373054300150365ustar00rootroot00000000000000snowball-3.1.0/dart/lib/snowball.dart000066400000000000000000000006711520373054300175370ustar00rootroot00000000000000// ignore_for_file: non_constant_identifier_names, curly_braces_in_flow_control_structures library; import 'src/algorithms.dart'; export 'src/algorithms.dart'; import 'src/snowball.dart' as impl; class SnowballStemmer { final impl.SnowballStemmer _stemmer; SnowballStemmer(Algorithm algorithm) : _stemmer = stemmers[algorithm]!; String stem(String s) { _stemmer.init(s); _stemmer.stem(); return _stemmer.current; } } snowball-3.1.0/dart/lib/src/000077500000000000000000000000001520373054300156255ustar00rootroot00000000000000snowball-3.1.0/dart/lib/src/snowball.dart000066400000000000000000000200601520373054300203200ustar00rootroot00000000000000// ignore_for_file: non_constant_identifier_names, curly_braces_in_flow_control_structures library; /// Internal class used 
by Snowball stemmers class Among { Among(this.s, this.substring_i, this.result, [this.function_index = 0]); /// search string final String s; /// index to longest matching substring final int substring_i; /// result of the lookup final int result; /// function index, 0 if none final int function_index; } class SnowballProgram { String current = ''; int cursor = 0; int limit = 0; int limit_backward = 0; int bra = 0; int ket = 0; int af = 0; SnowballProgram(); SnowballProgram.from(SnowballProgram other) : current = other.current, cursor = other.cursor, limit = other.limit, limit_backward = other.limit_backward, bra = other.bra, ket = other.ket, af = other.af; void init(String s) { current = s; cursor = 0; limit = current.length; limit_backward = 0; bra = cursor; ket = limit; } void copy_from(SnowballProgram other) { current = other.current; cursor = other.cursor; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } bool in_grouping(String s, int min, int max) { if (cursor >= limit) return false; int ch = current.codeUnitAt(cursor); if (ch > max || ch < min) return false; ch -= min; if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 0X7))) == 0) return false; cursor++; return true; } bool go_in_grouping(String s, int min, int max) { while (cursor < limit) { int ch = current.codeUnitAt(cursor); if (ch > max || ch < min) return true; ch -= min; if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 0X7))) == 0) return true; cursor++; } return false; } bool in_grouping_b(String s, int min, int max) { if (cursor <= limit_backward) return false; int ch = current.codeUnitAt(cursor - 1); if (ch > max || ch < min) return false; ch -= min; if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 0X7))) == 0) return false; cursor--; return true; } bool go_in_grouping_b(String s, int min, int max) { while (cursor > limit_backward) { int ch = current.codeUnitAt(cursor - 1); if (ch > max || ch < min) return true; ch -= min; if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 
0X7))) == 0) return true; cursor--; } return false; }

  /// Forward test for a code unit that is not in the grouping bitmap [s].
  ///
  /// [s] is indexed as a bitmap: bit `(ch - min) & 7` of code unit
  /// `(ch - min) >> 3`. Returns false at [limit]. Otherwise, if the code unit
  /// at [cursor] is outside [min]..[max] or its bit is clear, advances
  /// [cursor] by one and returns true; else returns false without moving.
  bool out_grouping(String s, int min, int max) {
    if (cursor >= limit) return false;
    int ch = current.codeUnitAt(cursor);
    if (ch > max || ch < min) {
      cursor++;
      return true;
    }
    ch -= min;
    if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 0X7))) == 0) {
      cursor++;
      return true;
    }
    return false;
  }

  /// Moves [cursor] forward until it sits on a code unit whose bit is set in
  /// the grouping bitmap [s].
  ///
  /// Returns true as soon as such a code unit is found (cursor left on it),
  /// or false if [limit] is reached first.
  bool go_out_grouping(String s, int min, int max) {
    while (cursor < limit) {
      int ch = current.codeUnitAt(cursor);
      if (ch <= max && ch >= min) {
        ch -= min;
        if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 0X7))) != 0) {
          return true;
        }
      }
      cursor++;
    }
    return false;
  }

  /// Backward counterpart of [out_grouping]: examines the code unit just
  /// before [cursor] and steps [cursor] back by one on success.
  ///
  /// Returns false at [limit_backward].
  bool out_grouping_b(String s, int min, int max) {
    if (cursor <= limit_backward) return false;
    int ch = current.codeUnitAt(cursor - 1);
    if (ch > max || ch < min) {
      cursor--;
      return true;
    }
    ch -= min;
    if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 0X7))) == 0) {
      cursor--;
      return true;
    }
    return false;
  }

  /// Backward counterpart of [go_out_grouping]: moves [cursor] backward until
  /// the code unit just before it has its bit set in [s].
  ///
  /// Returns false if [limit_backward] is reached first.
  bool go_out_grouping_b(String s, int min, int max) {
    while (cursor > limit_backward) {
      int ch = current.codeUnitAt(cursor - 1);
      if (ch <= max && ch >= min) {
        ch -= min;
        if ((s.codeUnitAt(ch >> 3) & (0X1 << (ch & 0X7))) != 0) {
          return true;
        }
      }
      cursor--;
    }
    return false;
  }

  /// If the text between [cursor] and [limit] starts with [s], advances
  /// [cursor] past it and returns true; otherwise returns false.
  bool eq_s(String s) {
    if (limit - cursor < s.length) return false;
    if (!current.startsWith(s, cursor)) return false;
    cursor += s.length;
    return true;
  }

  /// If the text between [limit_backward] and [cursor] ends with [s], moves
  /// [cursor] back over it and returns true; otherwise returns false.
  bool eq_s_b(String s) {
    if (cursor - limit_backward < s.length) return false;
    // Compare in place at the position where s would have to start. The
    // guard above ensures that index is >= limit_backward. This avoids
    // copying the whole prefix with substring() on every call.
    if (!current.startsWith(s, cursor - s.length)) return false;
    cursor -= s.length;
    return true;
  }

  /// Looks for an entry of [v] whose string matches the text at [cursor],
  /// scanning forward and never reading at or beyond [limit].
  ///
  /// The elements of [v] are read as [Among] objects. The first loop is a
  /// binary search over [v] (which presumably must be sorted by `s`; the
  /// table is built elsewhere). The second loop then follows `substring_i`
  /// links from the entry found. For the first linked entry that is fully
  /// matched, [cursor] is set just past the match; if the entry has a
  /// `function_index`, it is stored in `af` and [call_among_func] must
  /// return true for the entry to be accepted.
  ///
  /// Returns the accepted entry's `result`, or 0 if none is accepted.
  int find_among(List v, bool Function()? call_among_func) {
    int i = 0;
    int j = v.length;

    int c = cursor;
    int l = limit;

    int common_i = 0;
    int common_j = 0;

    bool first_key_inspected = false;

    while (true) {
      int k = i + ((j - i) >> 1);
      int diff = 0;
      int common = common_i < common_j ? common_i : common_j; // smaller
      Among w = v[k];
      for (int i2 = common; i2 < w.s.length; i2++) {
        if (c + common == l) {
          diff = -1;
          break;
        }
        diff = current.codeUnitAt(c + common) - w.s.codeUnitAt(i2);
        if (diff != 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        common_j = common;
      } else {
        i = k;
        common_i = common;
      }
      if (j - i <= 1) {
        if (i > 0) break; // v->s has been inspected
        if (j == i) break; // only one item in v

        // - but now we need to go round once more to get
        // v->s inspected. This looks messy, but is actually
        // the optimal approach.

        if (first_key_inspected) break;
        first_key_inspected = true;
      }
    }
    while (true) {
      Among w = v[i];
      if (common_i >= w.s.length) {
        cursor = c + w.s.length;
        if (w.function_index == 0) return w.result;
        af = w.function_index;
        if (call_among_func?.call() ?? false) {
          // Set the cursor again after the callback has run.
          cursor = c + w.s.length;
          return w.result;
        }
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
  }

  /// Backward counterpart of [find_among]: entries of [v] are compared from
  /// their last code unit against the text ending at [cursor], never reading
  /// before [limit_backward].
  ///
  /// On success [cursor] is set to the start of the matched text and the
  /// entry's `result` is returned; returns 0 if no entry is accepted.
  int find_among_b(List v, bool Function()? call_among_func) {
    int i = 0;
    int j = v.length;

    int c = cursor;
    int lb = limit_backward;

    int common_i = 0;
    int common_j = 0;

    bool first_key_inspected = false;

    while (true) {
      int k = i + ((j - i) >> 1);
      int diff = 0;
      int common = common_i < common_j ? common_i : common_j;
      Among w = v[k];
      for (int i2 = w.s.length - 1 - common; i2 >= 0; i2--) {
        if (c - common == lb) {
          diff = -1;
          break;
        }
        diff = current.codeUnitAt(c - 1 - common) - w.s.codeUnitAt(i2);
        if (diff != 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        common_j = common;
      } else {
        i = k;
        common_i = common;
      }
      if (j - i <= 1) {
        if (i > 0) break;
        if (j == i) break;
        if (first_key_inspected) break;
        first_key_inspected = true;
      }
    }
    while (true) {
      Among w = v[i];
      if (common_i >= w.s.length) {
        cursor = c - w.s.length;
        if (w.function_index == 0) return w.result;
        af = w.function_index;
        if (call_among_func?.call() ?? false) {
          // Set the cursor again after the callback has run.
          cursor = c - w.s.length;
          return w.result;
        }
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
  }

  /// Replaces `current[c_bra, c_ket)` with [s] and keeps [limit] and
  /// [cursor] consistent with the edit.
  ///
  /// A cursor at or after [c_ket] is shifted by the length change; a cursor
  /// strictly inside the replaced range is moved to [c_bra]. Returns the
  /// length change (new length minus old length of the range).
  int replace_s(int c_bra, int c_ket, String s) {
    final adjustment = s.length - (c_ket - c_bra);
    current = current.replaceRange(c_bra, c_ket, s);
    limit += adjustment;
    if (cursor >= c_ket) {
      cursor += adjustment;
    } else if (cursor > c_bra) {
      cursor = c_bra;
    }
    return adjustment;
  }

  /// Asserts that `0 <= bra <= ket <= limit <= current.length`.
  ///
  /// These are `assert`s, so they only run when assertions are enabled.
  void slice_check() {
    assert(bra >= 0, 'bra=$bra');
    assert(bra <= ket, 'bra=$bra,ket=$ket');
    assert(ket <= limit, 'ket=$ket,limit=$limit');
    assert(limit <= current.length, 'limit=$limit,length=${current.length}');
  }

  /// Replaces the slice `[bra, ket)` with [s] and moves [ket] to the end of
  /// the inserted text.
  void slice_from(String s) {
    slice_check();
    replace_s(bra, ket, s);
    ket = bra + s.length;
  }

  /// Deletes the slice `[bra, ket)`.
  void slice_del() {
    slice_from("");
  }

  /// Replaces `current[c_bra, c_ket)` with [s], shifting [bra] and [ket] by
  /// the length change when they are at or after [c_bra].
  void insert(int c_bra, int c_ket, String s) {
    int adjustment = replace_s(c_bra, c_ket, s);
    if (c_bra <= bra) bra += adjustment;
    if (c_bra <= ket) ket += adjustment;
  }

  /// Returns the text of the slice `[bra, ket)`.
  String slice_to() {
    slice_check();
    return current.substring(bra, ket);
  }

  /// Returns the text from the start of [current] up to [limit].
  String assign_to() {
    return current.substring(0, limit);
  }
}

/// Base class for stemmers built on [SnowballProgram]; subclasses implement
/// [stem].
abstract class SnowballStemmer extends SnowballProgram {
  bool stem();
}
snowball-3.1.0/dart/pubspec.yaml000066400000000000000000000005701520373054300166170ustar00rootroot00000000000000name: snowball description: This package provides stemmers generated from Snowball algorithms. version: 3.1.0 # SNOWBALL_VERSION repository: https://github.com/snowballstem/snowball issue_tracker: https://github.com/snowballstem/snowball/issues topics: - stemmer environment: sdk: ^3.8.1 dependencies: # path: ^1.8.0 dev_dependencies: lints: ^5.0.0 test: ^1.24.0 snowball-3.1.0/doc/000077500000000000000000000000001520373054300141035ustar00rootroot00000000000000snowball-3.1.0/doc/TODO000066400000000000000000000010221520373054300145660ustar00rootroot00000000000000Things to do: - Write documentation for how to use libstemmer (as opposed to how stemming algorithms themselves work).
Currently, the documentation in the include/libstemmer.h header file is pretty clear and comprehensive, but an overview document wouldn't go amiss. Things that would be nice to include at some point. - Add version numbers to each stemming algorithm, and allow the interface to request a specific version of the stemming algorithms. Default to providing the latest version of the algorithm. snowball-3.1.0/doc/libstemmer_c_README000066400000000000000000000145111520373054300175120ustar00rootroot00000000000000libstemmer_c ============ This document pertains to the C version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Compiling the library ===================== A simple makefile is provided for Unix style systems. On such systems, it should be possible simply to run "make", and the file "libstemmer.o" and the example program "stemwords" will be generated. If this doesn't work on your system, you need to write your own build system (or call the compiler directly). 
The files to compile are all contained in the "libstemmer", "runtime" and "src_c" directories, and the public header file is contained in the "include" directory. The library comes in two flavours; UTF-8 only, and UTF-8 plus other character sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of "libstemmer.c". For convenience "mkinc.mak" is a makefile fragment listing the source files and header files used to compile the standard version of the library. "mkinc_utf8.mak" is a comparable makefile fragment listing just the source files for the UTF-8 only version of the library. Using the library ================= The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. The library provides a simple C API. Essentially, a new stemmer can be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then used to stem a word, "sb_stemmer_length" returns the stemmed length of the last word processed, and "sb_stemmer_delete" is used to delete a stemmer. Generally you should create a stemmer object and reuse it rather than creating a fresh object for each word stemmed (since there's some cost to creating and destroying the object). The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. libstemmer does not currently incorporate any mechanism for caching the results of stemming operations. Such caching can greatly increase the performance of a stemmer under certain situations, so suitable patches will be considered for inclusion. 
The standard libstemmer sources contain an algorithm for each of the supported languages. The algorithm may be selected using the english name of the language, or using the 2 or 3 letter ISO 639 language codes. In addition, the traditional "Porter" stemming algorithm for english is included for backwards compatibility purposes, but we recommend use of the "English" stemmer in preference for new projects. (Some minor algorithms which are included only as curiosities in the snowball website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not included in the standard libstemmer sources. These are not really supported by the snowball project, but it would be possible to compile a modified libstemmer library containing these if desired.) The stemwords example ===================== The stemwords example program allows you to run any of the stemmers compiled into the libstemmer library on a sample vocabulary. For details on how to use it, run it with the "-h" command line option. Using the library in a larger system ==================================== If you are incorporating the library into the build system of a larger program, I recommend copying the unpacked tarball without modification into a subdirectory of the sources of your program. Future versions of the library are intended to keep the same structure, so this will keep the work required to move to a new version of the library to a minimum. As an additional convenience, the list of source and header files used in the library is detailed in mkinc.mak - a file which is in a suitable format for inclusion by a Makefile. By including this file in your build system, you can link the snowball system into your program with a few extra rules. 
Using the library in a system using GNU autotools ================================================= The libstemmer_c library can be integrated into a larger system which uses the GNU autotools framework (and in particular, automake and autoconf) as follows: 1) Unpack libstemmer_c-*.tar.gz in the top level project directory and rename the resulting directory to remove the version number so that there is a libstemmer_c subdirectory of the top level directory of the project. 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing: noinst_LTLIBRARIES = libstemmer.la include $(srcdir)/mkinc.mak noinst_HEADERS = $(snowball_headers) libstemmer_la_SOURCES = $(snowball_sources) (You may also need to add other lines to this, for example, if you are using compiler options which are not compatible with compiling the libstemmer library.) 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's configure.ac file. 4) Add to the top level makefile the following lines (or modify existing assignments to these variables appropriately): AUTOMAKE_OPTIONS = subdir-objects AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include SUBDIRS=libstemmer_c <name>_LIBADD = libstemmer_c/libstemmer.la (Where <name> is the name of the library or executable which links against libstemmer.) snowball-3.1.0/doc/libstemmer_csharp_README000066400000000000000000000042101520373054300205430ustar00rootroot00000000000000libstemmer_csharp ================= This document pertains to the C# version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms.
This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Compiling the library ===================== To build a library:: mcs -target:library -out:snowballstemmer.dll csharp/Snowball/*.cs csharp/Snowball/Algorithms/*cs And to build the example program using that library:: mcs -target:exe -out:stemwords.exe -r:snowballstemmer.dll csharp/Stemwords/Program.cs Using the library ================= The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. There is currently no formal documentation on the use of the C# version of the library. Additionally, its interface is not guaranteed to be stable. The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. snowball-3.1.0/doc/libstemmer_dart_README000066400000000000000000000027631520373054300202300ustar00rootroot00000000000000# snowball This package provides stemmers generated from Snowball algorithms. ### What is Stemming? 
Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. ### How to use library The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. The ``snowball`` package has two functions. ```dart import 'package:snowball/snowball.dart'; final stemmer = SnowballStemmer(Algorithm.english); print(stemmer.stem('walking')); ``` Generally you should create a stemmer object and reuse it rather than creating a fresh object for each word stemmed (since there's some cost to creating and destroying the object). snowball-3.1.0/doc/libstemmer_java_README000066400000000000000000000054451520373054300202170ustar00rootroot00000000000000libstemmer_java =============== This document pertains to the Java version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. 
So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. Requirements ============ The Java code generated by Snowball requires Java >= 7 (since Snowball 3.0.0). Java 7 was released in 2011, and Java 6's EOL was 2013 so we don't expect this to be a problematic requirement. Compiling the library ===================== Simply run the java compiler on all the java source files under the java directory. For example, this can be done under unix by changing directory into the java directory, and running: javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java This will compile the library and also an example program "TestApp" which provides a command line interface to the library. Using the library ================= The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. There is currently no formal documentation on the use of the Java version of the library. Additionally, its interface is not guaranteed to be stable. The best documentation of the library is the source of the TestApp example program. The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. 
The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. The TestApp example =================== The TestApp example program allows you to run any of the stemmers compiled into the libstemmer library on a sample vocabulary. For details on how to use it, run it with no command line parameters. snowball-3.1.0/doc/libstemmer_js_README000066400000000000000000000040511520373054300177020ustar00rootroot00000000000000Snowball stemming library collection for Javascript =================================================== What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. How to use library ------------------ The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. You can use each stemming module from Javascript code - e.g. to use one with node: ..
code-block:: javascript let EnglishStemmer = require('english-stemmer.js'); let stemmer = new EnglishStemmer(); console.log(stemmer.stemWord("testing")); You'll need to bundle ``base-stemmer.js`` and whichever languages you want stemmers for (e.g. ``english-stemmer.js`` for English). FIXME: Document how to use in a web browser. The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar but that's liable to slow your program down as threads can end up waiting for the lock. snowball-3.1.0/doc/libstemmer_php_README000066400000000000000000000025551520373054300200640ustar00rootroot00000000000000Snowball stemming library for PHP ================================= This is an automatically-generated pure PHP implementation of the Snowball stemming algorithms. The generated code is compatible with PHP 8.3 and later. The `mbstring` extension is required. It would be possible to extend support to older PHP (back to 7.4) if there's demand for it. Please contact us if this would be useful to you. This library is intended for situations where the convenience of a pure PHP implementation is more important than performance. If performance matters we recommend using the C implementation via bindings instead, (for example, using https://github.com/amaccis/php-stemmer). If you want both the convenience of pure PHP and reasonable performance, we suggest avoiding PHP 8.3 as we've noticed the generated stemmers seem to run 4-5 times faster with PHP >= 8.4 than with 8.3 (this performance gain seems much higher than we've seen reported for other PHP code, presumably due to something atypical the generated stemmer code does). 
How to use library ------------------ You can stem a word from PHP as follows: .. code-block:: php require 'base-stemmer.php'; require 'english-stemmer.php'; $stemmer = new SnowballEnglishStemmer; $word = 'compiled'; $stem = $stemmer->stemWord($word); printf("%s => %s\n", $word, $stem ); // prints "compiled => compil" snowball-3.1.0/doc/libstemmer_python_README000066400000000000000000000113041520373054300206060ustar00rootroot00000000000000Snowball stemming library collection for Python =============================================== Python 3 (>= 3.3) is supported. We no longer support Python 2 as the Python developers stopped supporting it at the start of 2020. Snowball 2.1.0 was the last release to officially support Python 2; Snowball 3.0.1 was the last release which had the code to support Python 2, but we were no longer testing it. What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. How to use library ------------------ The stemming algorithms generally expect the input text to use composed accents (Unicode NFC or NFKC) and to have been folded to lower case already. The ``snowballstemmer`` module has two functions. 
The ``snowballstemmer.algorithms`` function returns a list of available algorithm names. The ``snowballstemmer.stemmer`` function takes an algorithm name and returns a ``Stemmer`` object. ``Stemmer`` objects have a ``Stemmer.stemWord(word)`` method and a ``Stemmer.stemWords(word[])`` method. .. code-block:: python import snowballstemmer stemmer = snowballstemmer.stemmer('english') print(stemmer.stemWords("We are the world".split())) Generally you should create a stemmer object and reuse it rather than creating a fresh object for each word stemmed (since there's some cost to creating and destroying the object). The stemmer code is re-entrant, but not thread-safe if the same stemmer object is used concurrently in different threads. If you want to perform stemming concurrently in different threads, we suggest creating a new stemmer object for each thread. The alternative is to share stemmer objects between threads and protect access using a mutex or similar (e.g. `threading.Lock` in Python) but that's liable to slow your program down as threads can end up waiting for the lock. Automatic Acceleration ---------------------- `PyStemmer <https://pypi.org/project/PyStemmer/>`_ is a wrapper module for Snowball's ``libstemmer_c`` and should provide results 100% compatible with **snowballstemmer**. **PyStemmer** is faster because it wraps generated C versions of the stemmers; **snowballstemmer** uses generated Python code and is slower but offers a pure Python solution. If PyStemmer is installed, ``snowballstemmer.stemmer`` returns a ``PyStemmer`` ``Stemmer`` object which provides the same ``Stemmer.stemWord()`` and ``Stemmer.stemWords()`` methods. Benchmark ~~~~~~~~~ This is a crude benchmark which measures the time for running each stemmer on every word in its sample vocabulary (10,787,583 words over 26 languages). It's not a realistic test of normal use as a real application would do much more than just stemming.
It's also skewed towards the stemmers which do more work per word and towards those with larger sample vocabularies. * Python 2.7 + **snowballstemmer** : 13m00s (15.0 * PyStemmer) * Python 3.7 + **snowballstemmer** : 12m19s (14.2 * PyStemmer) * PyPy 7.1.1 (Python 2.7.13) + **snowballstemmer** : 2m14s (2.6 * PyStemmer) * PyPy 7.1.1 (Python 3.6.1) + **snowballstemmer** : 1m46s (2.0 * PyStemmer) * Python 2.7 + **PyStemmer** : 52s For reference the equivalent test for C runs in 9 seconds. These results are for Snowball 2.0.0. They're likely to evolve over time as the code Snowball generates for both Python and C continues to improve (for a much older test over a different set of stemmers using Python 2.7, **snowballstemmer** was 30 times slower than **PyStemmer**, or 9 times slower with **PyPy**). The message to take away is that if you're stemming a lot of words you should either install **PyStemmer** (which **snowballstemmer** will then automatically use for you as described above) or use PyPy. The TestApp example ------------------- The ``testapp.py`` example program allows you to run any of the stemmers on a sample vocabulary. Usage:: testapp.py "sentences ... " .. code-block:: bash $ python testapp.py English "sentences... " snowball-3.1.0/examples/000077500000000000000000000000001520373054300151545ustar00rootroot00000000000000snowball-3.1.0/examples/stemwords.c000066400000000000000000000150471520373054300173560ustar00rootroot00000000000000/* This is a simple program which uses libstemmer to provide a command * line interface for stemming using any of the algorithms provided. */ #include #include /* for malloc, free */ #include /* for memmove */ #include /* for isupper, tolower */ #ifdef __cplusplus // Support compiling as C++ to test code generated by `snowball -c++`. 
# include #endif #include "libstemmer.h" const char * progname; static int pretty = 1; static void stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) { #define INC 10 int lim = INC; sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); while (1) { int ch = getc(f_in); if (ch == EOF) { free(b); return; } { int i = 0; int inlen = 0; while (ch != '\n' && ch != EOF) { if (i == lim) { sb_symbol * newb; newb = (sb_symbol *) realloc(b, (lim + INC) * sizeof(sb_symbol)); if (newb == NULL) goto error; b = newb; lim = lim + INC; } /* Update count of utf-8 characters. */ if (ch < 0x80 || ch > 0xBF) inlen += 1; /* force lower case: */ ch = tolower(ch); b[i] = ch; i++; ch = getc(f_in); } { const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); if (stemmed == NULL) { fprintf(stderr, "Out of memory or internal error\n"); exit(1); } if (pretty == 1) { fwrite(b, i, 1, f_out); fputs(" -> ", f_out); } else if (pretty == 2) { fwrite(b, i, 1, f_out); if (sb_stemmer_length(stemmer) > 0) { int j; if (inlen < 30) { for (j = 30 - inlen; j > 0; j--) fputs(" ", f_out); } else { fputs("\n", f_out); for (j = 30; j > 0; j--) fputs(" ", f_out); } } } fputs((const char *)stemmed, f_out); putc('\n', f_out); } } } error: free(b); return; } /** Display the command line syntax, and then exit. * @param n The value to exit with. */ static void usage(int n) { printf("usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h]\n" "\n" "The input file consists of a list of words to be stemmed, one per\n" "line. Words should be in lower case, but (for English) A-Z letters\n" "are mapped to their a-z equivalents anyway. If omitted, stdin is\n" "used.\n" "\n" "If -c is given, the argument is the character encoding of the input\n" "and output files. 
If it is omitted, the UTF-8 encoding is used.\n" "\n", progname); printf( "If -p is given the output file consists of each word of the input\n" "file followed by \"->\" followed by its stemmed equivalent.\n" "If -p2 is given the output file is a two column layout containing\n" "the input words in the first column and the stemmed equivalents in\n" "the second column.\n" "Otherwise, the output file consists of the stemmed words, one per\n" "line.\n" "\n" "-h displays this help\n"); exit(n); } int main(int argc, char * argv[]) #ifdef __cplusplus try #endif { const char * in = NULL; const char * out = NULL; FILE * f_in; FILE * f_out; struct sb_stemmer * stemmer; const char * language = "english"; const char * charenc = NULL; int i = 1; pretty = 0; progname = argv[0]; while (i < argc) { const char * s = argv[i++]; if (s[0] == '-') { if (strcmp(s, "-o") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } out = argv[i++]; } else if (strcmp(s, "-i") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } in = argv[i++]; } else if (strcmp(s, "-l") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } language = argv[i++]; } else if (strcmp(s, "-c") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } charenc = argv[i++]; } else if (strcmp(s, "-p2") == 0) { pretty = 2; } else if (strcmp(s, "-p") == 0) { pretty = 1; } else if (strcmp(s, "-h") == 0) { usage(0); } else { fprintf(stderr, "option %s unknown\n", s); usage(1); } } else { fprintf(stderr, "unexpected parameter %s\n", s); usage(1); } } /* prepare the files */ f_in = (in == NULL) ? stdin : fopen(in, "r"); if (f_in == NULL) { fprintf(stderr, "file %s not found\n", in); exit(1); } f_out = (out == NULL) ? 
stdout : fopen(out, "w"); if (f_out == NULL) { fprintf(stderr, "file %s cannot be opened\n", out); exit(1); } /* do the stemming process: */ stemmer = sb_stemmer_new(language, charenc); if (stemmer == NULL) { if (charenc == NULL) { fprintf(stderr, "language `%s' not available for stemming\n", language); exit(1); } else { fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); exit(1); } } stem_file(stemmer, f_in, f_out); sb_stemmer_delete(stemmer); if (in != NULL) (void) fclose(f_in); if (out != NULL) (void) fclose(f_out); return 0; } #ifdef __cplusplus catch (const std::exception& e) { fprintf(stderr, "Exception: %s\n", e.what()); exit(1); } #endif snowball-3.1.0/go/000077500000000000000000000000001520373054300137435ustar00rootroot00000000000000snowball-3.1.0/go/README.md000066400000000000000000000043401520373054300152230ustar00rootroot00000000000000# Go Target for Snowball The initial implementation was built as a port of the Rust target. The initial focus has been on getting it to function, and making it work correctly. No attempt has been made to beautify the implementation, generated code, or address performance issues. ## Usage To generate Go source for a Snowball algorithm: ``` $ snowball path/to/algorithm.sbl -go -o algorithm ``` ### Go specific options `-Package`/`-P`: the package name used in the generated go file (defaults to `snowball`) `-goruntime`/`-gor`: the import path used for the Go Snowball runtime (defaults to `github.com/snowballstem/snowball/go`) Snowball 3.0.1 and earlier supported `-go`/`-gopackage`, but this was just an alias for `-Package`/`-P` since Snowball 2.0.0. 
## Code Organization `compiler/generator_go.c` has the Go code generation logic `go/` contains the default Go Snowball runtime support `go/stemwords` contains the source for a Go version of the stemwords utility `go/algorithms` location where the makefile generated code will end up ## Using the Generated Stemmers Assuming you generated a stemmer, put that code in a package which is imported by this code as `english`. ``` env := snowball.NewEnv("") env.SetCurrent("beautiful") english.Stem(env) fmt.Printf("stemmed word is: %s", env.Current()) ``` If you are stemming many words you should reuse `env` as shown above. If you are stemming a single word, you can set the current string when you create it with `env:= snowball.NewEnv("beautiful")`, but doing this when stemming many words is not recommended as the overhead is measurable (stemwords for the arabic stemmer on the sample vocabulary is about 12% faster if you reuse `env`). ## Testing Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language. Run: ``` $ make check_go ``` An initial pass of fuzz-testing has been performed on the generated stemmers for the algorithms in this repo. Each ran for 5 minutes and used an initial corpus seeded with 10k words from the algorithm's snowballstem-data voc.txt file. 
## Known Limitations - Code going through generate_dollar production has not been tested - Code going through generate_debug production has not been tested snowball-3.1.0/go/among.go000066400000000000000000000004071520373054300153740ustar00rootroot00000000000000package snowball import "fmt" type AmongF func(env *Env, ctx interface{}) bool type Among struct { Str string A int32 B int32 F AmongF } func (a *Among) String() string { return fmt.Sprintf("str: `%s`, a: %d, b: %d, f: %p", a.Str, a.A, a.B, a.F) } snowball-3.1.0/go/env.go000066400000000000000000000210411520373054300150600ustar00rootroot00000000000000package snowball import ( "log" "strings" "unicode/utf8" ) // Env represents the Snowball execution environment type Env struct { current string Cursor int Limit int LimitBackward int Bra int Ket int } // NewEnv creates a new Snowball execution environment on the provided string func NewEnv(val string) *Env { return &Env{ current: val, Cursor: 0, Limit: len(val), LimitBackward: 0, Bra: 0, Ket: len(val), } } func (env *Env) Current() string { return env.current } func (env *Env) SetCurrent(s string) { env.current = s env.Cursor = 0 env.Limit = len(s) env.LimitBackward = 0 env.Bra = 0 env.Ket = len(s) } func (env *Env) ReplaceS(bra, ket int, s string) int32 { adjustment := int32(len(s)) - (int32(ket) - int32(bra)) result := env.current[:bra] result += s rsplit := ket if ket < bra { rsplit = bra } result += env.current[rsplit:] newLim := int32(env.Limit) + adjustment env.Limit = int(newLim) if env.Cursor >= ket { newCur := int32(env.Cursor) + adjustment env.Cursor = int(newCur) } else if env.Cursor > bra { env.Cursor = bra } env.current = result return adjustment } func (env *Env) EqS(s string) bool { if env.Cursor >= env.Limit { return false } if strings.HasPrefix(env.current[env.Cursor:], s) { env.Cursor += len(s) for !onCharBoundary(env.current, env.Cursor) { env.Cursor++ } return true } return false } func (env *Env) EqSB(s string) bool { if 
int32(env.Cursor)-int32(env.LimitBackward) < int32(len(s)) { return false } else if !onCharBoundary(env.current, env.Cursor-len(s)) || !strings.HasPrefix(env.current[env.Cursor-len(s):], s) { return false } else { env.Cursor -= len(s) return true } } func (env *Env) SliceFrom(s string) { bra, ket := env.Bra, env.Ket env.ReplaceS(bra, ket, s) env.Ket = bra + len(s) } func (env *Env) NextChar() { env.Cursor++ for !onCharBoundary(env.current, env.Cursor) { env.Cursor++ } } func (env *Env) PrevChar() { env.Cursor-- for !onCharBoundary(env.current, env.Cursor) { env.Cursor-- } } func (env *Env) Hop(delta int) bool { res := env.Cursor for delta > 0 { delta-- if res >= env.Limit { return false } res++ for res < env.Limit && !onCharBoundary(env.current, res) { res++ } } env.Cursor = res return true } func (env *Env) HopChecked(delta int) bool { return delta >= 0 && env.Hop(delta) } func (env *Env) HopBack(delta int) bool { res := env.Cursor for delta > 0 { delta-- if res <= env.LimitBackward { return false } res-- for res > env.LimitBackward && !onCharBoundary(env.current, res) { res-- } } env.Cursor = res return true } func (env *Env) HopBackChecked(delta int) bool { return delta >= 0 && env.HopBack(delta) } func (env *Env) InGrouping(chars []byte, min, max int32) bool { if env.Cursor >= env.Limit { return false } r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { return false } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return false } env.NextChar() return true } func (env *Env) GoInGrouping(chars []byte, min, max int32) bool { for env.Cursor < env.Limit { r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return true } env.NextChar() } return false } func (env *Env) InGroupingB(chars []byte, min, max int32) bool { if env.Cursor <= 
env.LimitBackward { return false } c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { env.Cursor = c return false } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.Cursor = c return false } return true } func (env *Env) GoInGroupingB(chars []byte, min, max int32) bool { for env.Cursor > env.LimitBackward { c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { env.Cursor = c return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.Cursor = c return true } } return false } func (env *Env) OutGrouping(chars []byte, min, max int32) bool { if env.Cursor >= env.Limit { return false } r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { env.NextChar() return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.NextChar() return true } return false } func (env *Env) GoOutGrouping(chars []byte, min, max int32) bool { for env.Cursor < env.Limit { r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r <= max && r >= min { r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 { return true } } env.NextChar() } return false } func (env *Env) OutGroupingB(chars []byte, min, max int32) bool { if env.Cursor <= env.LimitBackward { return false } c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r == utf8.RuneError { return false } if r > max || r < min { return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return true } env.Cursor = c return false } func (env *Env) GoOutGroupingB(chars []byte, min, max int32) bool { for env.Cursor > env.LimitBackward { c := env.Cursor env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) 
if r == utf8.RuneError { return false } if r <= max && r >= min { r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 { env.Cursor = c return true } } } return false } func (env *Env) SliceDel() { env.SliceFrom("") } func (env *Env) Insert(bra, ket int, s string) { adjustment := env.ReplaceS(bra, ket, s) if bra <= env.Bra { env.Bra = int(int32(env.Bra) + adjustment) } if bra <= env.Ket { env.Ket = int(int32(env.Ket) + adjustment) } } func (env *Env) SliceTo() string { return env.current[env.Bra:env.Ket] } func (env *Env) FindAmong(amongs []*Among, ctx interface{}) int32 { var i int32 j := int32(len(amongs)) c := env.Cursor l := env.Limit var commonI, commonJ int firstKeyInspected := false for { k := i + ((j - i) >> 1) var diff int32 common := min(commonI, commonJ) w := amongs[k] for lvar := common; lvar < len(w.Str); lvar++ { if c+common == l { diff-- break } diff = int32(env.current[c+common]) - int32(w.Str[lvar]) if diff != 0 { break } common++ } if diff < 0 { j = k commonJ = common } else { i = k commonI = common } if j-i <= 1 { if i > 0 { break } if j == i { break } if firstKeyInspected { break } firstKeyInspected = true } } for { w := amongs[i] if commonI >= len(w.Str) { env.Cursor = c + len(w.Str) if w.F != nil { if w.F(env, ctx) { env.Cursor = c + len(w.Str) return w.B } } else { return w.B } } i = w.A if i < 0 { return 0 } } } func (env *Env) FindAmongB(amongs []*Among, ctx interface{}) int32 { var i int32 j := int32(len(amongs)) c := env.Cursor lb := env.LimitBackward var commonI, commonJ int firstKeyInspected := false for { k := i + ((j - i) >> 1) diff := int32(0) common := min(commonI, commonJ) w := amongs[k] for lvar := len(w.Str) - int(common) - 1; lvar >= 0; lvar-- { if c-common == lb { diff-- break } diff = int32(env.current[c-common-1]) - int32(w.Str[lvar]) if diff != 0 { break } // Count up commons. 
But not one character but the byte width of that char common++ } if diff < 0 { j = k commonJ = common } else { i = k commonI = common } if j-i <= 1 { if i > 0 { break } if j == i { break } if firstKeyInspected { break } firstKeyInspected = true } } for { w := amongs[i] if commonI >= len(w.Str) { env.Cursor = c - len(w.Str) if w.F != nil { if w.F(env, ctx) { env.Cursor = c - len(w.Str) return w.B } } else { return w.B } } i = w.A if i < 0 { return 0 } } } func (env *Env) Debug(count, lineNumber int) { log.Printf("snowball debug, count: %d, line: %d", count, lineNumber) } func (env *Env) Clone() *Env { clone := *env return &clone } func (env *Env) AssignTo() string { return env.Current() } snowball-3.1.0/go/stemwords/000077500000000000000000000000001520373054300157725ustar00rootroot00000000000000snowball-3.1.0/go/stemwords/generate.go000066400000000000000000000020421520373054300201110ustar00rootroot00000000000000// +build ignore package main import ( "flag" "fmt" "io" "io/ioutil" "log" "os" ) // tool to register all algorithms built with the stemwords tool func main() { flag.Parse() if flag.NArg() < 1 { log.Fatal("must specify algorithms directory") } var w io.Writer if flag.NArg() > 1 { var err error w, err = os.Create(flag.Arg(1)) if err != nil { log.Fatalf("error creating output file %v", err) } } else { w = os.Stdout } fmt.Fprintf(w, "%s", header) files, err := ioutil.ReadDir(flag.Arg(0)) if err != nil { log.Fatal(err) } for _, file := range files { fmt.Fprintf(w, " %s \"github.com/snowballstem/snowball/go/algorithms/%s\"\n", file.Name(), file.Name()) } fmt.Fprintf(w, closeImportStartInit) for _, file := range files { fmt.Fprintf(w, " languages[\"%s\"] = %s.Stem\n", file.Name(), file.Name()) } fmt.Fprintf(w, "%s", footer) } var header = `// generated list of supported algorithms, DO NOT EDIT package main import ( ` var closeImportStartInit = `) func init() {` var footer = `} ` 
snowball-3.1.0/go/stemwords/main.go000066400000000000000000000023511520373054300172460ustar00rootroot00000000000000//go:generate go run generate.go ../algorithms algorithms.go //go:generate gofmt -s -w algorithms.go package main import ( "bufio" "flag" "fmt" "log" "os" snowballRuntime "github.com/snowballstem/snowball/go" ) var language = flag.String("l", "", "language") var input = flag.String("i", "", "input file") var output = flag.String("o", "", "output file") func main() { flag.Parse() if *language == "" { log.Fatal("must specify language") } stemmer, ok := languages[*language] if !ok { log.Fatalf("no language support for %s", *language) } var reader = os.Stdin if *input != "" { var err error reader, err = os.Open(*input) if err != nil { log.Fatal(err) } defer reader.Close() } var writer = os.Stdout if *output != "" { var err error writer, err = os.Create(*output) if err != nil { log.Fatal(err) } defer writer.Close() } var err error scanner := bufio.NewScanner(reader) env := snowballRuntime.NewEnv("") for scanner.Scan() { word := scanner.Text() env.SetCurrent(word) stemmer(env) fmt.Fprintf(writer, "%s\n", env.Current()) } if err = scanner.Err(); err != nil { log.Fatal(err) } } type StemFunc func(env *snowballRuntime.Env) bool var languages = make(map[string]StemFunc) snowball-3.1.0/go/util.go000066400000000000000000000010561520373054300152510ustar00rootroot00000000000000package snowball import ( "math" "unicode/utf8" ) const MaxInt = math.MaxInt32 const MinInt = math.MinInt32 func min(a, b int) int { if a < b { return a } return b } func onCharBoundary(s string, pos int) bool { if pos <= 0 || pos >= len(s) { return true } return utf8.RuneStart(s[pos]) } // RuneCountInString is a wrapper around utf8.RuneCountInString // this allows us to not have to conditionally include // the utf8 package into some stemmers and not others func RuneCountInString(str string) int { return utf8.RuneCountInString(str) } 
snowball-3.1.0/iconv.py000066400000000000000000000024211520373054300150250ustar00rootroot00000000000000#!env python # Simple (but slow) iconv replacement in Python. import sys in_cs = out_cs = in_file = out_file = pending = None for arg in sys.argv[1:]: if pending != None: arg = pending + arg pending = None if arg.startswith('-'): if arg[1] in ('f', 't', 'o'): if len(arg) == 2: pending = arg continue if arg[1] == 'f': in_cs = arg[2:] continue if arg[1] == 't': out_cs = arg[2:] continue if arg[1] == 'o': out_file = open(arg[2:], 'wb') continue print("Unknown option: '%s'" % arg) sys.exit(1) if in_file == None: in_file = open(arg, 'rb') continue print("Too many arguments") sys.exit(1) if in_cs == None: print("Need to specify input cs with -f") sys.exit(1) if out_cs == None: print("Need to specify output cs with -t") sys.exit(1) if in_file == None: if hasattr(sys.stdin, 'buffer'): in_file = sys.stdin.buffer else: in_file = sys.stdin if out_file == None: if hasattr(sys.stdout, 'buffer'): out_file = sys.stdout.buffer else: out_file = sys.stdout out_file.write(in_file.read().decode(in_cs).encode(out_cs)) snowball-3.1.0/include/000077500000000000000000000000001520373054300147615ustar00rootroot00000000000000snowball-3.1.0/include/libstemmer.h000066400000000000000000000060431520373054300173000ustar00rootroot00000000000000 /* Make header file work when included from C++ */ #ifdef __cplusplus extern "C" { #endif struct sb_stemmer; typedef unsigned char sb_symbol; /* FIXME - should be able to get a version number for each stemming * algorithm (which will be incremented each time the output changes). */ /** Returns an array of the names of the available stemming algorithms. * Note that these are the canonical names - aliases (ie, other names for * the same algorithm) will not be included in the list. * The list is terminated with a null pointer. * * The list must not be modified in any way. 
*/ const char ** sb_stemmer_list(void); /** Create a new stemmer object, using the specified algorithm, for the * specified character encoding. * * All algorithms will usually be available in UTF-8, but may also be * available in other character encodings. * * @param algorithm The algorithm name. This is either the english * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the * language. Note that case is significant in this parameter - the * value should be supplied in lower case. * * @param charenc The character encoding. NULL may be passed as * this value, in which case UTF-8 encoding will be assumed. Otherwise, * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1), * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is * significant in this parameter. * * @return NULL if the specified algorithm is not recognised, or the * algorithm is not available for the requested encoding. Otherwise, * returns a pointer to a newly created stemmer for the requested algorithm. * The returned pointer must be deleted by calling sb_stemmer_delete(). * * @note NULL will also be returned if an out of memory error occurs. */ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); /** Delete a stemmer object. * * This frees all resources allocated for the stemmer. After calling * this function, the supplied stemmer may no longer be used in any way. * * It is safe to pass a null pointer to this function - this will have * no effect. */ void sb_stemmer_delete(struct sb_stemmer * stemmer); /** Stem a word. * * The stemming algorithms generally expect the input text to use composed * accents (Unicode NFC or NFKC) and to have been folded to lower case * already. * * The return value is owned by the stemmer - it must not be freed or * modified, and it will become invalid when the stemmer is called again, * or if the stemmer is freed. * * The length of the return value can be obtained using sb_stemmer_length(). 
* * If an out-of-memory error occurs, this will return NULL. */ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size); /** Get the length of the result of the last stemmed word. * This should not be called before sb_stemmer_stem() has been called. */ int sb_stemmer_length(struct sb_stemmer * stemmer); #ifdef __cplusplus } #endif snowball-3.1.0/java/000077500000000000000000000000001520373054300142575ustar00rootroot00000000000000snowball-3.1.0/java/org/000077500000000000000000000000001520373054300150465ustar00rootroot00000000000000snowball-3.1.0/java/org/tartarus/000077500000000000000000000000001520373054300167135ustar00rootroot00000000000000snowball-3.1.0/java/org/tartarus/snowball/000077500000000000000000000000001520373054300205345ustar00rootroot00000000000000snowball-3.1.0/java/org/tartarus/snowball/Among.java000066400000000000000000000033431520373054300224430ustar00rootroot00000000000000package org.tartarus.snowball; import java.lang.invoke.MethodHandle; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; import java.util.Locale; /** * Internal class used by Snowball stemmers */ public class Among { public Among (String s, int substring_i, int result) { this.s = s.toCharArray(); this.substring_i = substring_i; this.result = result; this.method = null; } public Among (String s, int substring_i, int result, String methodname, MethodHandles.Lookup methodobject) { this.s = s.toCharArray(); this.substring_i = substring_i; this.result = result; final Class clazz = methodobject.lookupClass().asSubclass(SnowballProgram.class); if (methodname.length() > 0) { try { this.method = methodobject.findVirtual(clazz, methodname, MethodType.methodType(boolean.class)) .asType(MethodType.methodType(boolean.class, SnowballProgram.class)); } catch (NoSuchMethodException | IllegalAccessException e) { throw new RuntimeException(String.format(Locale.ENGLISH, "Snowball program '%s' is broken, cannot access method: boolean 
%s()", clazz.getSimpleName(), methodname ), e); } } else { this.method = null; } } final char[] s; /* search string */ final int substring_i; /* index to longest matching substring */ final int result; /* result of the lookup */ // Make sure this is not accessible outside package for Java security reasons! final MethodHandle method; /* method to use if substring matches */ } snowball-3.1.0/java/org/tartarus/snowball/CharArraySequence.java000066400000000000000000000015041520373054300247440ustar00rootroot00000000000000package org.tartarus.snowball; /** * Internal class used by Snowball stemmers */ public class CharArraySequence implements CharSequence { public CharArraySequence(char[] a, int len) { this.a = a; this.len = len; } final public char charAt(int index) { if (index < 0 || index >= len) { throw new StringIndexOutOfBoundsException(index); } return a[index]; } final public int length() { return len; } final public CharSequence subSequence(int start, int end) { // Not needed for how we used CharSequence. throw new UnsupportedOperationException(); } final public String toString() { if (a == null) return ""; return new String(a, 0, len); } final private char[] a; final private int len; } snowball-3.1.0/java/org/tartarus/snowball/SnowballProgram.java000066400000000000000000000312561520373054300245170ustar00rootroot00000000000000 package org.tartarus.snowball; import java.lang.reflect.UndeclaredThrowableException; import java.util.Arrays; /** * Base class for a snowball stemmer */ public class SnowballProgram { protected SnowballProgram() { cursor = 0; length = limit = 0; limit_backward = 0; bra = cursor; ket = limit; } /** * Set the current string. */ public void setCurrent(String value) { setCurrent(value.toCharArray(), value.length()); } /** * Get the current string. */ public String getCurrent() { return new String(current, 0, length); } /** * Set the current string. * @param text character array containing input * @param length valid length of text. 
*/ public void setCurrent(char[] text, int length) { current = text; cursor = 0; this.length = limit = length; limit_backward = 0; bra = cursor; ket = limit; } /** * Get the current buffer containing the stem. *

* NOTE: this may be a reference to a different character array than the * one originally provided with setCurrent, in the exceptional case that * stemming produced a longer intermediate or result string. *

*

* It is necessary to use {@link #getCurrentBufferLength()} to determine * the valid length of the returned buffer. For example, many words are * stemmed simply by subtracting from the length to remove suffixes. *

* @see #getCurrentBufferLength() */ public char[] getCurrentBuffer() { return current; } /** * Get the valid length of the character array in * {@link #getCurrentBuffer()}. * @return valid length of the array. */ public int getCurrentBufferLength() { return length; } // current string protected char[] current; protected int cursor; protected int length; protected int limit; protected int limit_backward; protected int bra; protected int ket; public SnowballProgram(SnowballProgram other) { current = other.current; cursor = other.cursor; length = other.length; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } protected void copy_from(SnowballProgram other) { current = other.current; cursor = other.cursor; length = other.length; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } protected boolean in_grouping(char[] s, int min, int max) { if (cursor >= limit) return false; int ch = current[cursor]; if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor++; return true; } protected boolean go_in_grouping(char[] s, int min, int max) { while (cursor < limit) { int ch = current[cursor]; if (ch > max || ch < min) return true; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return true; cursor++; } return false; } protected boolean in_grouping_b(char[] s, int min, int max) { if (cursor <= limit_backward) return false; int ch = current[cursor - 1]; if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor--; return true; } protected boolean go_in_grouping_b(char[] s, int min, int max) { while (cursor > limit_backward) { int ch = current[cursor - 1]; if (ch > max || ch < min) return true; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return true; cursor--; } return false; } protected boolean out_grouping(char[] s, int min, int max) { if (cursor >= 
limit) return false; int ch = current[cursor]; if (ch > max || ch < min) { cursor++; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor++; return true; } return false; } protected boolean go_out_grouping(char[] s, int min, int max) { while (cursor < limit) { int ch = current[cursor]; if (ch <= max && ch >= min) { ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) != 0) { return true; } } cursor++; } return false; } protected boolean out_grouping_b(char[] s, int min, int max) { if (cursor <= limit_backward) return false; int ch = current[cursor - 1]; if (ch > max || ch < min) { cursor--; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor--; return true; } return false; } protected boolean go_out_grouping_b(char[] s, int min, int max) { while (cursor > limit_backward) { int ch = current[cursor - 1]; if (ch <= max && ch >= min) { ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) != 0) { return true; } } cursor--; } return false; } protected boolean eq_s(CharSequence s) { if (limit - cursor < s.length()) return false; int i; for (i = 0; i != s.length(); i++) { if (current[cursor + i] != s.charAt(i)) return false; } cursor += s.length(); return true; } protected boolean eq_s_b(CharSequence s) { if (cursor - limit_backward < s.length()) return false; int i; for (i = 0; i != s.length(); i++) { if (current[cursor - s.length() + i] != s.charAt(i)) return false; } cursor -= s.length(); return true; } protected int find_among(Among[] v) { int i = 0; int j = v.length; int c = cursor; int l = limit; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; // smaller Among w = v[k]; int i2; for (i2 = common; i2 < w.s.length; i2++) { if (c + common == l) { diff = -1; break; } diff = current[c + common] - w.s[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j == i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.s.length) { cursor = c + w.s.length; if (w.method == null) return w.result; try { if ((boolean) w.method.invokeExact(this)) { cursor = c + w.s.length; return w.result; } } catch (Error | RuntimeException e) { throw e; } catch (Throwable e) { throw new UndeclaredThrowableException(e); } } i = w.substring_i; if (i < 0) return 0; } } // find_among_b is for backwards processing. Same comments apply protected int find_among_b(Among[] v) { int i = 0; int j = v.length; int c = cursor; int lb = limit_backward; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; Among w = v[k]; int i2; for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = current[c - 1 - common] - w.s[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.s.length) { cursor = c - w.s.length; if (w.method == null) return w.result; try { if ((boolean) w.method.invokeExact(this)) { cursor = c - w.s.length; return w.result; } } catch (Error | RuntimeException e) { throw e; } catch (Throwable e) { throw new UndeclaredThrowableException(e); } } i = w.substring_i; if (i < 0) return 0; } } /* to replace chars between c_bra and c_ket in current by the * chars in s. */ protected int replace_s(int c_bra, int c_ket, CharSequence s) { final int adjustment = s.length() - (c_ket - c_bra); final int newLength = length + adjustment; //resize if necessary if (newLength > current.length) { current = Arrays.copyOf(current, newLength); } // if the substring being replaced is longer or shorter than the // replacement, need to shift things around if (adjustment != 0 && c_ket < length) { System.arraycopy(current, c_ket, current, c_bra + s.length(), length - c_ket); } // insert the replacement text // Note, faster is s.getChars(0, s.length(), current, c_bra); // but would have to duplicate this method for both String and StringBuilder for (int i = 0; i < s.length(); i++) current[c_bra + i] = s.charAt(i); length += adjustment; limit += adjustment; if (cursor >= c_ket) cursor += adjustment; else if (cursor > c_bra) cursor = c_bra; return adjustment; } protected void slice_check() { assert bra >= 0 : "bra=" + bra; assert bra <= ket : "bra=" + bra + ",ket=" + ket; assert limit <= length : "limit=" + limit + ",length=" + length; assert ket <= limit : "ket=" + ket + ",limit=" + limit; } 
protected void slice_from(CharSequence s) { slice_check(); replace_s(bra, ket, s); ket = bra + s.length(); } protected void slice_del() { slice_from(""); } protected void insert(int c_bra, int c_ket, CharSequence s) { int adjustment = replace_s(c_bra, c_ket, s); if (c_bra <= bra) bra += adjustment; if (c_bra <= ket) ket += adjustment; } /* extern void debug(struct SN_env * z, int number, int line_count) { int i; int limit = SIZE(z->p); //if (number >= 0) printf("%3d (line %4d): '", number, line_count); if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); for (i = 0; i <= limit; i++) { if (z->lb == i) printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < limit) { int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } printf("'\n"); } */ } snowball-3.1.0/java/org/tartarus/snowball/SnowballStemmer.java000066400000000000000000000003411520373054300245130ustar00rootroot00000000000000 package org.tartarus.snowball; /** * Parent class of all snowball stemmers, which must implement stem */ public abstract class SnowballStemmer extends SnowballProgram { public abstract boolean stem(); }; snowball-3.1.0/java/org/tartarus/snowball/TestApp.java000066400000000000000000000053351520373054300227650ustar00rootroot00000000000000 package org.tartarus.snowball; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.util.Arrays; public class TestApp { private static void usage() { System.err.println("Usage: TestApp [] [-o ]"); } private static SnowballStemmer getStemmer(String lang) { try { String c = "org.tartarus.snowball.ext." 
+ lang + "Stemmer"; return (SnowballStemmer) Class.forName(c).getDeclaredConstructor().newInstance(); } catch (ReflectiveOperationException e) { return null; } } public static void main(String[] args) throws Throwable { if (args.length < 1) { usage(); return; } SnowballStemmer stemmer = getStemmer(args[0]); if (stemmer == null) { System.err.println("Stemmer " + args[0] + " not found"); return; } int arg = 1; InputStream instream; if (args.length > arg && !args[arg].equals("-o")) { instream = new FileInputStream(args[arg++]); } else { instream = System.in; } OutputStream outstream; if (args.length > arg) { if (args.length != arg + 2 || !args[arg].equals("-o")) { usage(); return; } outstream = new FileOutputStream(args[arg + 1]); } else { outstream = System.out; } Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8); reader = new BufferedReader(reader); Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8); output = new BufferedWriter(output); char[] input = new char[8]; int length = 0; int character; while ((character = reader.read()) != -1) { char ch = (char) character; if (Character.isWhitespace(ch)) { stemmer.setCurrent(input, length); stemmer.stem(); output.write(stemmer.getCurrentBuffer(), 0, stemmer.getCurrentBufferLength()); output.write('\n'); length = 0; } else { if (length == input.length) { input = Arrays.copyOf(input, length + 1); } input[length++] = ch < 127 ? 
Character.toLowerCase(ch) : ch; } } output.close(); } } snowball-3.1.0/javascript/000077500000000000000000000000001520373054300155045ustar00rootroot00000000000000snowball-3.1.0/javascript/base-stemmer.js000066400000000000000000000302731520373054300204330ustar00rootroot00000000000000// @ts-check export default class BaseStemmer { constructor() { /** @protected */ this.current = ''; this.c = 0; this.limit = 0; this.limit_backward = 0; this.bra = 0; this.ket = 0; this.af = 0; } /** * @param {string} value */ setCurrent(value) { this.current = value; this.c = 0; this.limit = this.current.length; this.limit_backward = 0; this.bra = this.c; this.ket = this.limit; } /** * @return {string} */ getCurrent() { return this.current; } /** * @param {BaseStemmer} other */ copy_from(other) { /** @protected */ this.current = other.current; this.c = other.c; this.limit = other.limit; this.limit_backward = other.limit_backward; this.bra = other.bra; this.ket = other.ket; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ in_grouping(s, min, max) { /** @protected */ if (this.c >= this.limit) return false; let ch = this.current.charCodeAt(this.c); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) === 0) return false; this.c++; return true; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ go_in_grouping(s, min, max) { /** @protected */ while (this.c < this.limit) { let ch = this.current.charCodeAt(this.c); if (ch > max || ch < min) return true; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) === 0) return true; this.c++; } return false; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ in_grouping_b(s, min, max) { /** @protected */ if (this.c <= this.limit_backward) return false; let ch = this.current.charCodeAt(this.c - 1); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) === 
0) return false; this.c--; return true; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ go_in_grouping_b(s, min, max) { /** @protected */ while (this.c > this.limit_backward) { let ch = this.current.charCodeAt(this.c - 1); if (ch > max || ch < min) return true; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) === 0) return true; this.c--; } return false; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ out_grouping(s, min, max) { /** @protected */ if (this.c >= this.limit) return false; let ch = this.current.charCodeAt(this.c); if (ch > max || ch < min) { this.c++; return true; } ch -= min; if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) === 0) { this.c++; return true; } return false; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ go_out_grouping(s, min, max) { /** @protected */ while (this.c < this.limit) { let ch = this.current.charCodeAt(this.c); if (ch <= max && ch >= min) { ch -= min; if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) !== 0) { return true; } } this.c++; } return false; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ out_grouping_b(s, min, max) { /** @protected */ if (this.c <= this.limit_backward) return false; let ch = this.current.charCodeAt(this.c - 1); if (ch > max || ch < min) { this.c--; return true; } ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) === 0) { this.c--; return true; } return false; } /** * @param {Array} s * @param {number} min * @param {number} max * @return {boolean} */ go_out_grouping_b(s, min, max) { /** @protected */ while (this.c > this.limit_backward) { let ch = this.current.charCodeAt(this.c - 1); if (ch <= max && ch >= min) { ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) !== 0) { return true; } } this.c--; } return false; } /** * @param {string} s * @return {boolean} */ eq_s(s) { /** @protected */ if (this.limit - this.c < s.length) return false; if 
(!this.current.startsWith(s, this.c)) return false; this.c += s.length; return true; } /** * @param {string} s * @return {boolean} */ eq_s_b(s) { /** @protected */ if (this.c - this.limit_backward < s.length) return false; if (!this.current.endsWith(s, this.c)) return false; this.c -= s.length; return true; } /** * @param {Array>} v * @param {?function(): boolean} call_among_func * @return {number} */ find_among(v, call_among_func) { /** @protected */ let i = 0; let j = v.length; const c = this.c; const l = this.limit; let common_i = 0; let common_j = 0; let first_key_inspected = false; while (true) { const k = i + ((j - i) >>> 1); let diff = 0; let common = common_i < common_j ? common_i : common_j; // smaller // w[0]: string, w[1]: result, w[2]: substring_i (optional), w[3]: function (optional) const w = v[k]; let i2; // @ts-expect-error: w[0] always string. for (i2 = common; i2 < w[0].length; i2++) { if (c + common === l) { diff = -1; break; } // @ts-expect-error: w[0] always string. diff = this.current.charCodeAt(c + common) - w[0].charCodeAt(i2); if (diff !== 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j === i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) break; first_key_inspected = true; } } while (true) { const w = v[i]; // @ts-expect-error: w[0] always string. if (common_i >= w[0].length) { // @ts-expect-error: w[0] always string. this.c = c + w[0].length; // @ts-expect-error: w[1] always number. if (w.length < 4) return w[1]; // @ts-expect-error: w[3] always number. this.af = w[3]; // @ts-expect-error: call_among_func never null here. if (call_among_func.call(this)) { // @ts-expect-error: w[0] always string. this.c = c + w[0].length; // @ts-expect-error: w[3] always number. 
return w[1]; } } // Tests for undefined (if w.length < 2) or 0. if (!w[2]) return 0; // @ts-expect-error: w[2] always number. i -= w[2]; } } // find_among_b is for backwards processing. Same comments apply /** * @param {Array>} v * @param {?function(): boolean} call_among_func */ find_among_b(v, call_among_func) { /** @protected */ let i = 0; let j = v.length const c = this.c; const lb = this.limit_backward; let common_i = 0; let common_j = 0; let first_key_inspected = false; while (true) { const k = i + ((j - i) >> 1); let diff = 0; let common = common_i < common_j ? common_i : common_j; const w = v[k]; let i2; // @ts-expect-error: w[0] always string. for (i2 = w[0].length - 1 - common; i2 >= 0; i2--) { if (c - common === lb) { diff = -1; break; } // @ts-expect-error: w[0] always string. diff = this.current.charCodeAt(c - 1 - common) - w[0].charCodeAt(i2); if (diff !== 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j === i) break; if (first_key_inspected) break; first_key_inspected = true; } } while (true) { const w = v[i]; // @ts-expect-error: w[0] always string. if (common_i >= w[0].length) { // @ts-expect-error: w[0] always string. this.c = c - w[0].length; if (w.length < 4) return w[1]; // @ts-expect-error: w[3] always number. this.af = w[3]; // @ts-expect-error: call_among_func never null here. if (call_among_func.call(this)) { // @ts-expect-error: w[0] always string. this.c = c - w[0].length; return w[1]; } } // Tests for undefined (if w.length < 2) or 0. if (!w[2]) return 0; // @ts-expect-error: w[2] always number. i -= w[2]; } } /* to replace chars between c_bra and c_ket in this.current by the * chars in s. 
*/ /** * @param {number} c_bra * @param {number} c_ket * @param {string} s * @return {number} */ #replace_s(c_bra, c_ket, s) { const adjustment = s.length - (c_ket - c_bra); this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket); this.limit += adjustment; if (this.c >= c_ket) this.c += adjustment; else if (this.c > c_bra) this.c = c_bra; return adjustment; } /** */ #slice_check() { console.assert(this.bra >= 0); console.assert(this.bra <= this.ket); console.assert(this.ket <= this.limit); console.assert(this.limit <= this.current.length); } /** * @param {string} s */ slice_from(s) { /** @protected */ this.#slice_check(); this.#replace_s(this.bra, this.ket, s); this.ket = this.bra + s.length; } /** */ slice_del() { /** @protected */ this.slice_from(""); } /** * @param {number} c_bra * @param {number} c_ket * @param {string} s */ insert(c_bra, c_ket, s) { /** @protected */ const adjustment = this.#replace_s(c_bra, c_ket, s); if (c_bra <= this.bra) this.bra += adjustment; if (c_bra <= this.ket) this.ket += adjustment; } /** * @return {string} */ slice_to() { /** @protected */ this.#slice_check(); return this.current.slice(this.bra, this.ket); } } snowball-3.1.0/javascript/stemwords.js000066400000000000000000000067561520373054300201070ustar00rootroot00000000000000import fs from 'node:fs'; import process from 'node:process'; import readline from 'node:readline'; function usage() { console.log(`usage: stemwords.js [-l ] [-i ] [-o ] [-c ] [-h] The input file consists of a list of words to be stemmed, one per line. Words should be in lower case. Language defaults to "English", input to stdin, and output to stdout. If -c is given, the argument is the character encoding of the input and output files. If it is omitted, the UTF-8 encoding is used. The output file consists of the stemmed words, one per line. 
-h displays this help`); } { let input = ''; let output = ''; let encoding = 'utf8'; let language = 'English'; let usage_error = false; // Skip the first two entries of argv which are the interpreter // and the script name. // // deno doesn't allow modifying process.argv so we need to make // a copy here. const argv = process.argv.slice(2); while (argv.length > 0) { const arg = argv.shift(); switch (arg) { case "-h": usage(); process.exit(0); break; case "-l": if (argv.length === 0) { usage_error = true; break; } language = argv.shift(); break; case "-i": if (argv.length === 0) { usage_error = true; break; } input = argv.shift(); break; case "-o": if (argv.length === 0) { usage_error = true; break; } output = argv.shift(); break; case "-c": if (argv.length === 0) { usage_error = true; break; } encoding = argv.shift(); break; default: console.log('Unknown command line option: ' + arg + '\n'); usage_error = true; } if (usage_error) { usage(); process.exit(1); } } const stemmer = await create(language); let istream, ostream; if (input !== '') { istream = fs.createReadStream(input, encoding); } else { istream = process.stdin; if (istream.setEncoding) istream.setEncoding(encoding); } if (output !== '') { ostream = fs.createWriteStream(output, encoding); } else { ostream = process.stdout; if (ostream.setEncoding) ostream.setEncoding(encoding); } stemming(stemmer, istream, ostream); } // function stemming (stemmer : Stemmer, input : Stream, output : Stream) { function stemming (stemmer, input, output) { const lines = readline.createInterface({ input: input, terminal: false }); lines.on('line', (original) => { output.write(stemmer.stemWord(original) + '\n'); }); } async function create (name) { const lc_name = name.toLowerCase(); if (/\W/.test(lc_name) || lc_name === 'base') { console.log('Unknown stemming language: ' + name + '\n'); usage(); process.exit(1); return; } const filename = `../js_out/${lc_name}-stemmer.js`; try { // Load stemmer class from the module scope 
const stemmerModule = await import(filename); return new stemmerModule.default(); } catch (error) { console.error(error); } } snowball-3.1.0/libstemmer/000077500000000000000000000000001520373054300155015ustar00rootroot00000000000000snowball-3.1.0/libstemmer/libstemmer_c.in000066400000000000000000000043061520373054300205010ustar00rootroot00000000000000 #include #include #include "../include/libstemmer.h" #include "../runtime/api.h" #include "@MODULES_H@" struct sb_stemmer { struct SN_env * (*create)(void); void (*close)(struct SN_env *); int (*stem)(struct SN_env *); struct SN_env * env; }; extern const char ** sb_stemmer_list(void) { return algorithm_names; } static stemmer_encoding_t sb_getenc(const char * charenc) { const struct stemmer_encoding * encoding; if (charenc == NULL) return ENC_UTF_8; for (encoding = encodings; encoding->name != 0; encoding++) { if (strcmp(encoding->name, charenc) == 0) break; } if (encoding->name == NULL) return ENC_UNKNOWN; return encoding->enc; } extern struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc) { stemmer_encoding_t enc; const struct stemmer_modules * module; struct sb_stemmer * stemmer; enc = sb_getenc(charenc); if (enc == ENC_UNKNOWN) return NULL; for (module = modules; module->name != 0; module++) { if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; } if (module->name == NULL) return NULL; stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); if (stemmer == NULL) return NULL; stemmer->create = module->create; stemmer->close = module->close; stemmer->stem = module->stem; stemmer->env = stemmer->create(); if (stemmer->env == NULL) { sb_stemmer_delete(stemmer); return NULL; } return stemmer; } void sb_stemmer_delete(struct sb_stemmer * stemmer) { if (stemmer == 0) return; if (stemmer->close) { stemmer->close(stemmer->env); stemmer->close = 0; } free(stemmer); } const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) { int 
ret; if (SN_set_current(stemmer->env, size, (const symbol *)(word))) { stemmer->env->l = 0; return NULL; } ret = stemmer->stem(stemmer->env); if (ret < 0) return NULL; stemmer->env->p[stemmer->env->l] = 0; return (const sb_symbol *)(stemmer->env->p); } int sb_stemmer_length(struct sb_stemmer * stemmer) { return stemmer->env->l; } snowball-3.1.0/libstemmer/mkalgorithms.pl000077500000000000000000000035441520373054300205500ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use 5.006; use warnings; my $progname = $0; if (scalar @ARGV != 2) { print "Usage: $progname \n"; exit 1; } my $outname = shift(@ARGV); my $descfile = shift(@ARGV); my %aliases = (); my %algorithms = (); my %algorithm_encs = (); my %encs = (); sub addalgenc($$) { my $alg = shift(); my $enc = shift(); if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; } else { my %newhash = ($enc => 1); $algorithm_encs{$alg}=\%newhash; } $encs{$enc} = 1; } sub readinput() { open DESCFILE, $descfile; my $line; while ($line = ) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; my ($alg,$encstr,$aliases) = split(/\s+/, $line); my $enc; my $alias; $algorithms{$alg} = 1; foreach $alias (split(/,/, $aliases)) { foreach $enc (split(/,/, $encstr)) { $aliases{$alias} = $alg; addalgenc($alg, $enc); } } } } sub printoutput() { open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; print OUT <{$enc}; } print OUT "\n"; } } readinput(); printoutput(); snowball-3.1.0/libstemmer/mkmodules.pl000077500000000000000000000137371520373054300200540ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use 5.006; use warnings; my $progname = $0; if (scalar @ARGV < 4 || scalar @ARGV > 5) { print "Usage: $progname []\n"; exit 1; } my $outname = shift(@ARGV); my $c_src_dir = shift(@ARGV); my $descfile = shift(@ARGV); my $srclistfile = shift(@ARGV); my $enc_only; my $extn = ''; if (@ARGV) { $enc_only = shift(@ARGV); $extn = '_'.$enc_only; } my %aliases = (); my 
%algorithms = (); my %algorithm_encs = (); my %encs = (); sub addalgenc($$) { my $alg = shift(); my $enc = shift(); if (defined $enc_only) { my $norm_enc = lc $enc; $norm_enc =~ s/_//g; if ($norm_enc ne $enc_only) { return; } } if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; } else { my %newhash = ($enc => 1); $algorithm_encs{$alg}=\%newhash; } $encs{$enc} = 1; } sub readinput() { open DESCFILE, $descfile; my $line; while ($line = ) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; my ($alg,$encstr,$aliases) = split(/\s+/, $line); my $enc; my $alias; $algorithms{$alg} = 1; foreach $alias (split(/,/, $aliases)) { foreach $enc (split(/,/, $encstr)) { # print "$alias, $enc\n"; $aliases{$alias} = $alg; addalgenc($alg, $enc); } } } } sub printoutput() { open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; print OUT < 77) { print OUT ",\n * "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n */\n\n"; foreach $lang (@algorithms) { my $hashref = $algorithm_encs{$lang}; foreach $enc (sort keys (%$hashref)) { print OUT "#include \"../$c_src_dir/stem_${enc}_$lang.h\"\n"; } } print OUT <$srclistfile") or die "Can't open output file `$srclistfile': $!\n"; print OUT < 77) { print OUT ",\n# "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n\nsnowball_sources= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort keys (%$hashref)) { print OUT " src_c/stem_${enc}_${lang}.c \\\n"; } } $need_sep = 0; for $srcfile ('runtime/api.c', 'runtime/utilities.c', "libstemmer/libstemmer${extn}.c") { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 1; } print OUT "\n\nsnowball_headers= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort 
keys (%$hashref)) { my $p = "${lang}_${enc}"; print OUT " src_c/stem_${enc}_${lang}.h \\\n"; } } $need_sep = 0; for $srcfile ('include/libstemmer.h', "libstemmer/modules${extn}.h", 'runtime/api.h', 'runtime/snowball_runtime.h') { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 1; } print OUT "\n\n"; close OUT or die "Can't close ${srclistfile}: $!\n"; } readinput(); printoutput(); printsrclist(); snowball-3.1.0/libstemmer/modules.txt000066400000000000000000000073311520373054300177160ustar00rootroot00000000000000# This file contains a list of stemmers to include in the distribution. # The format is a set of space separated lines - on each line: # First item is name of stemmer. # Second item is comma separated list of character sets. # Third item is comma separated list of names to refer to the stemmer by. # # Lines starting with a #, or blank lines, are ignored. # List all the main algorithms for each language, in UTF-8, and also with # the most commonly used encoding. 
arabic UTF_8 arabic,ar,ara armenian UTF_8 armenian,hy,hye,arm basque UTF_8,ISO_8859_1 basque,eu,eus,baq catalan UTF_8,ISO_8859_1 catalan,ca,cat czech UTF_8,ISO_8859_2 czech,cs,ces,cze danish UTF_8,ISO_8859_1 danish,da,dan dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld,kraaij_pohlmann english UTF_8,ISO_8859_1 english,en,eng esperanto UTF_8 esperanto,eo,epo estonian UTF_8 estonian,et,est finnish UTF_8,ISO_8859_1 finnish,fi,fin french UTF_8,ISO_8859_1 french,fr,fre,fra german UTF_8,ISO_8859_1 german,de,ger,deu greek UTF_8 greek,el,gre,ell hindi UTF_8 hindi,hi,hin hungarian UTF_8,ISO_8859_2 hungarian,hu,hun indonesian UTF_8,ISO_8859_1 indonesian,id,ind irish UTF_8,ISO_8859_1 irish,ga,gle italian UTF_8,ISO_8859_1 italian,it,ita lithuanian UTF_8 lithuanian,lt,lit nepali UTF_8 nepali,ne,nep norwegian UTF_8,ISO_8859_1 norwegian,no,nor polish UTF_8,ISO_8859_2 polish,pl,pol persian UTF_8 persian,fa,fas,pers portuguese UTF_8,ISO_8859_1 portuguese,pt,por romanian UTF_8 romanian,ro,rum,ron russian UTF_8,KOI8_R russian,ru,rus serbian UTF_8 serbian,sr,srp spanish UTF_8,ISO_8859_1 spanish,es,esl,spa swedish UTF_8,ISO_8859_1 swedish,sv,swe tamil UTF_8 tamil,ta,tam turkish UTF_8 turkish,tr,tur yiddish UTF_8 yiddish,yi,yid sesotho UTF_8 sesotho,st,sot # Also include the traditional porter algorithm for english. # The porter algorithm is included in the libstemmer distribution to assist # with backwards compatibility, but for new systems the english algorithm # should be used in preference. porter UTF_8,ISO_8859_1 porter english # This is Martin Porter's Dutch stemmer. It was the default Dutch stemming # in Snowball 2.2.0 and earlier, but after user feedback and careful evaluation # we concluded that the Kraaij-Pohlmann Dutch stemmer was a better default. # We still provide this to help people who have a lot of existing data indexed # using it. dutch_porter UTF_8,ISO_8859_1 dutch_porter dutch # Some other stemmers in the snowball project are not included in the standard # distribution. 
To compile a libstemmer with them in, add them to this list, # and regenerate the distribution. (You will need a full source checkout for # this.) They are included in the snowball website as curiosities, but are not # intended for general use, and use of them is is not fully supported. These # algorithms are: # # lovins - This is an english stemmer, but fairly outdated, and # only really applicable to a restricted type of input text # (keywords in academic publications). #lovins UTF_8,ISO_8859_1 lovins english snowball-3.1.0/libstemmer/test.c000066400000000000000000000020041520373054300166200ustar00rootroot00000000000000 #include "libstemmer.h" /* test code */ void error(const char * err) { printf("%s\n", err); exit(1); } int main () { const char * stemmed; const char * unstemmed; struct sb_stemmer * s; const char ** list = sb_stemmer_list(); if (*list == 0) error("TEST FAIL: empty list of stemmers"); s = sb_stemmer_new("e"); if (s != 0) error("TEST FAIL: non zero return for unrecognised language"); s = sb_stemmer_new("english"); if (s == 0) error("TEST FAIL: zero return for recognised language"); sb_stemmer_delete(s); s = sb_stemmer_new("en"); if (s == 0) error("TEST FAIL: zero return for recognised language"); unstemmed = "recognised"; stemmed = sb_stemmer_stem(s, unstemmed, 10); printf("%s -> %s\n", unstemmed, stemmed); if (sb_stemmer_length(s) != strlen(stemmed)) error("TEST FAIL: length not correct"); unstemmed = "recognized"; printf("%s -> %s\n", unstemmed, stemmed); sb_stemmer_delete(s); printf("Success\n"); return 0; } snowball-3.1.0/pascal/000077500000000000000000000000001520373054300146015ustar00rootroot00000000000000snowball-3.1.0/pascal/.gitignore000066400000000000000000000000571520373054300165730ustar00rootroot00000000000000/*.ppu /*Stemmer.pas /stemwords.dpr /stemwords snowball-3.1.0/pascal/SnowballProgram.pas000066400000000000000000000271701520373054300204260ustar00rootroot00000000000000unit SnowballProgram; interface Type TAmongHandler = Function : 
Boolean of Object; Type TAmong = record Str : AnsiString; // search string Index : Integer; // index to longest matching substring Result : Integer; // result of the lookup Method : TAmongHandler; // method to use if substring matches End; Type {$M+} TSnowballProgram = Class Protected FCurrent : AnsiString; FCursor : Integer; FLimit : Integer; FBkLimit : Integer; FBra : Integer; FKet : Integer; Procedure SetCurrent(Current: AnsiString); Protected Function InGrouping(s : array of char; min, max : Integer) : Boolean; Function GoInGrouping(s : array of char; min, max : Integer) : Boolean; Function InGroupingBk(s : array of char; min, max : Integer) : Boolean; Function GoInGroupingBk(s : array of char; min, max : Integer) : Boolean; Function OutGrouping(s : array of char; min, max : Integer) : Boolean; Function GoOutGrouping(s : array of char; min, max : Integer) : Boolean; Function OutGroupingBk(s : array of char; min, max : Integer) : Boolean; Function GoOutGroupingBk(s : array of char; min, max : Integer) : Boolean; Function EqS(s : AnsiString) : Boolean; Function EqSBk(s : AnsiString) : Boolean; Function FindAmong(v : array of TAmong; v_size : Integer) : Integer; Function FindAmongBk(v : array of TAmong; v_size : Integer) : Integer; Procedure SliceDel; Procedure SliceCheck; Procedure SliceFrom(s : AnsiString); Function ReplaceS(bra, ket : Integer; s : AnsiString) : Integer; Procedure Insert(bra, ket : Integer; s : AnsiString); Function SliceTo : AnsiString; Function AssignTo : AnsiString; Public { Set & Retrieve current string } Property Current: AnsiString Read FCurrent Write SetCurrent; { Method subclasses need to implement } Function stem : Boolean; Virtual; Abstract; End; Implementation Uses Math; Procedure TSnowballProgram.SetCurrent(Current: AnsiString); Begin FCurrent := Current; FCursor := 0; FLimit := Length(Current); FBkLimit := 0; FBra := FCursor; FKet := FLimit; End; Function TSnowballProgram.InGrouping(s : array of char; min, max : Integer) : Boolean; 
Var ch : Integer; Begin Result := False; If (FCursor >= FLimit) Then Exit; ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Inc(FCursor); Result := True; End; Function TSnowballProgram.GoInGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := True; While (FCursor < FLimit) Do Begin ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Inc(FCursor); End; Result := False; End; Function TSnowballProgram.InGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor <= FBkLimit) Then Exit; ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Dec(FCursor); Result := True; End; Function TSnowballProgram.GoInGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := True; While (FCursor > FBkLimit) Do Begin ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Dec(FCursor); End; Result := False; End; Function TSnowballProgram.OutGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor >= FLimit) Then Exit; ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Begin Inc(FCursor); Result := True; Exit; End; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Begin Inc(FCursor); Result := True; End; End; Function TSnowballProgram.GoOutGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := True; While (FCursor < FLimit) Do Begin ch := Ord(FCurrent[FCursor + 1]); If (ch <= max) And (ch >= min) Then Begin ch := ch - min; If (Ord(s[ch Shr 
3]) And Ord(1 Shl (ch And $7))) <> 0 Then Begin Exit; End; End; Inc(FCursor); End; Result := False; End; Function TSnowballProgram.OutGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor <= FBkLimit) Then Exit; ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Begin Dec(FCursor); Result := True; Exit; End; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Begin Dec(FCursor); Result := True; End; End; Function TSnowballProgram.GoOutGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := True; While (FCursor > FBkLimit) Do Begin ch := Ord(FCurrent[FCursor]); If (ch <= max) And (ch >= min) Then Begin ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) <> 0 Then Begin Exit; End; End; Dec(FCursor); End; Result := False; End; Function TSnowballProgram.EqS(s : AnsiString) : Boolean; Var I, s_size : Integer; Begin Result := False; s_size := Length(s); If (FLimit - FCursor) < s_size Then Exit; For I := 1 To s_size Do If FCurrent[FCursor + I] <> s[I] Then Exit; FCursor := FCursor + s_size; Result := True; End; Function TSnowballProgram.EqSBk(s : AnsiString) : Boolean; Var I, s_size : Integer; Begin Result := False; s_size := Length(s); if (FCursor - FBkLimit) < s_size Then Exit; For I := 1 To s_size Do If FCurrent[FCursor - s_size + I] <> s[i] Then Exit; FCursor := FCursor - s_size; Result := True; End; Function TSnowballProgram.FindAmong(v : array of TAmong; v_size : Integer) : Integer; Var i, i2, j, c, l, common_i, common_j, k, diff, common : Integer; first_key_inspected : Boolean; w : TAmong; Begin i := 0; j := v_size; c := FCursor; l := FLimit; common_i := 0; common_j := 0; first_key_inspected := false; While True Do Begin k := i + ((j - i) Shr 1); diff := 0; common := Min(common_i, common_j); // smaller w := v[k]; For i2 := common To Length(w.Str) - 1 Do Begin if (c + common) = l Then Begin diff := -1; Break; End; diff := 
Ord(FCurrent[c + common + 1]) - Ord(w.Str[i2 + 1]); if diff <> 0 Then Break; Inc(common); End; if diff < 0 Then Begin j := k; common_j := common; End Else Begin i := k; common_i := common; End; If (j - i) <= 1 Then Begin If (i > 0) Then Break; // v->s has been inspected if (j = i) Then Break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) Then Break; first_key_inspected := True; End; End; While True Do Begin w := v[i]; If (common_i >= Length(w.Str)) Then Begin FCursor := c + Length(w.Str); If Not Assigned(w.Method) Then Begin Result := w.Result; Exit; End; if w.Method Then Begin FCursor := c + Length(w.Str); Result := w.Result; Exit; End; End; i := w.Index; if i < 0 Then Begin Result := 0; Exit; End; End; End; Function TSnowballProgram.FindAmongBk(v : array of TAmong; v_size : Integer) : Integer; Var i, j, c, lb, common_i, common_j, k, diff, common, i2 : Integer; first_key_inspected : Boolean; w : TAmong; Begin i := 0; j := v_size; c := FCursor; lb := FBkLimit; common_i := 0; common_j := 0; first_key_inspected := false; While True Do Begin k := i + ((j - i) Shr 1); diff := 0; common := Min(common_i, common_j); w := v[k]; For i2 := Length(w.Str) - 1 - common DownTo 0 Do Begin If (c - common) = lb Then Begin diff := -1; Break; End; diff := Ord(FCurrent[c - common]) - Ord(w.Str[i2 + 1]); if diff <> 0 Then Break; Inc(common); End; If diff < 0 Then Begin j := k; common_j := common; End Else Begin i := k; common_i := common; End; If (j - i) <= 1 Then Begin if i > 0 Then Break; if j = i Then Break; if first_key_inspected Then Break; first_key_inspected := True; End; End; While True Do Begin w := v[i]; if common_i >= Length(w.Str) Then Begin FCursor := c - Length(w.Str); If Not Assigned(w.Method) Then Begin Result := w.Result; Exit; End; if w.Method Then Begin FCursor := c - Length(w.Str); Result := w.Result; Exit; End; End; i := w.Index; If i < 0 
Then Begin Result := 0; Exit; End; End; End; Procedure TSnowballProgram.SliceCheck; Begin if (FBra < 0) Or (FBra > FKet) Or (FKet > FLimit) Or (FLimit > Length(FCurrent)) Then Begin WriteLn('Faulty slice operation.'); Halt; End; End; Procedure TSnowballProgram.SliceDel; Begin SliceFrom(''); End; Function TSnowballProgram.ReplaceS(bra, ket : Integer; s : AnsiString) : Integer; Var adjustment : Integer; Begin adjustment := Length(s) - (ket - bra); Delete(FCurrent, bra + 1, ket - bra); System.Insert(s, FCurrent, bra + 1); FLimit := FLimit + adjustment; if (FCursor >= ket) Then FCursor := FCursor + adjustment Else If (FCursor > bra) Then FCursor := bra; Result := adjustment; End; Procedure TSnowballProgram.Insert(bra, ket : Integer; s : AnsiString); Var adjustment : Integer; Begin adjustment := ReplaceS(bra, ket, s); If (bra <= FBra) Then FBra := FBra + adjustment; If (bra <= FKet) Then FKet := FKet + adjustment; End; Function TSnowballProgram.SliceTo() : AnsiString; Begin SliceCheck(); Result := Copy(FCurrent, FBra + 1, FKet - FBra); End; Procedure TSnowballProgram.SliceFrom(s : AnsiString); Begin SliceCheck(); ReplaceS(FBra, FKet, s); FKet := FBra + Length(s); End; Function TSnowballProgram.AssignTo() : AnsiString; Begin Result := Copy(FCurrent, 1, FLimit); End; End. snowball-3.1.0/pascal/generate.pl000077500000000000000000000010201520373054300167240ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; # Generate Pascal stemwords source. 
my @sources = @ARGV; while (defined(my $line = )) { if ($line =~ /\{\s*BEGIN TEMPLATE\s*\}/) { my $template = ''; while (defined($line = ) && $line !~ /\{\s*END TEMPLATE\s*\}/) { $template .= $line; } foreach my $source(@sources) { my $out = $template; $out =~ s/%STEMMER%/$source/g; print $out; } next; } print $line; } snowball-3.1.0/pascal/stemwords-template.dpr000066400000000000000000000026751520373054300211620ustar00rootroot00000000000000program stemwords; {$ifdef windows} {$APPTYPE CONSOLE} {$endif} uses SnowballProgram, { BEGIN TEMPLATE } %STEMMER%Stemmer in '%STEMMER%Stemmer.pas', { END TEMPLATE } SysUtils; Var Stemmer : TSnowballProgram; CurWord : AnsiString; i : Integer; language : AnsiString; Const Delimiters : Set Of Char = [#10, #13]; Function NextWord : Boolean; Var C : Char; Begin CurWord := ''; Result := Not Eof; While Not Eof Do Begin Read(C); If IOResult <> 0 Then Break; If C In Delimiters Then Break; CurWord := CurWord + C; End; End; begin language := 'english'; i := 0; while i < ParamCount do begin i := i + 1; if ParamStr(i) = '-l' then begin i := i + 1; language := ParamStr(i); continue; end; WriteLn('option '+ParamStr(i)+' unknown'); Exit; end; if False then { BEGIN TEMPLATE } else if language = '%STEMMER%' then Stemmer := T%STEMMER%Stemmer.Create { END TEMPLATE } else begin WriteLn('Stemming language '+language+' unknown'); Exit; end; Try While Not Eof Do Begin While NextWord Do Begin Stemmer.Current := CurWord; Stemmer.Stem; WriteLn(Stemmer.Current); End; End; Finally Stemmer.Free; End; end. 
snowball-3.1.0/php/000077500000000000000000000000001520373054300141255ustar00rootroot00000000000000snowball-3.1.0/php/base-stemmer.php000066400000000000000000000301141520373054300172210ustar00rootroot00000000000000current = $other->current; $this->cursor = $other->cursor; $this->limit = $other->limit; $this->limit_backward = $other->limit_backward; $this->bra = $other->bra; $this->ket = $other->ket; } /** * @param int[] $s */ protected function in_grouping(array $s): bool { if ($this->cursor >= $this->limit) { return false; } $ch = $this->charAt(); if (!array_key_exists($ch, $s)) { return false; } $this->cursor += strlen($ch); return true; } /** * @param int[] $s */ protected function go_in_grouping(array $s): bool { while ($this->cursor < $this->limit) { $ch = $this->charAt(); if (!array_key_exists($ch, $s)) { return true; } $this->cursor += strlen($ch); } return false; } /** * @param int[] $s */ protected function in_grouping_b(array $s): bool { if ($this->cursor <= $this->limit_backward) { return false; } $ch = $this->charBefore(); if (!array_key_exists($ch, $s)) { return false; } $this->cursor -= strlen($ch); return true; } /** * @param int[] $s */ protected function go_in_grouping_b(array $s): bool { while ($this->cursor > $this->limit_backward) { $ch = $this->charBefore(); if (!array_key_exists($ch, $s)) { return true; } $this->cursor -= strlen($ch); } return false; } /** * @param int[] $s */ protected function out_grouping(array $s): bool { if ($this->cursor >= $this->limit) { return false; } $ch = $this->charAt(); if (!array_key_exists($ch, $s)) { $this->cursor += strlen($ch); return true; } return false; } /** * @param int[] $s */ protected function go_out_grouping(array $s): bool { while ($this->cursor < $this->limit) { $ch = $this->charAt(); if (array_key_exists($ch, $s)) { return true; } $this->cursor += strlen($ch); } return false; } /** * @param int[] $s */ protected function out_grouping_b(array $s): bool { if ($this->cursor <= $this->limit_backward) 
{ return false; } $ch = $this->charBefore(); if (!array_key_exists($ch, $s)) { $this->cursor -= strlen($ch); return true; } return false; } /** * @param int[] $s */ protected function go_out_grouping_b(array $s): bool { while ($this->cursor > $this->limit_backward) { $ch = $this->charBefore(); if (array_key_exists($ch, $s)) { return true; } $this->cursor -= strlen($ch); } return false; } protected function eq_s(string $s): bool { $slength = strlen($s); if ($this->limit - $this->cursor < $slength) { return false; } if (substr_compare($this->current, $s, $this->cursor, $slength) != 0) { return false; } $this->cursor += $slength; return true; } protected function eq_s_b(string $s): bool { $slength = strlen($s); if ($this->cursor - $this->limit_backward < $slength) { return false; } if (substr_compare($this->current, $s, $this->cursor - $slength, $slength) != 0) { return false; } $this->cursor -= $slength; return true; } /** * @param array[] $v */ protected function find_among(array $v): int { $i = 0; $j = count($v); $c = $this->cursor; $l = $this->limit; $common_i = 0; $common_j = 0; $first_key_inspected = false; while (true) { $k = $i + (($j-$i) >> 1); $diff = 0; $common = min($common_i, $common_j); // smaller // w[0]: string, w[1]: substring_i, w[2]: result, w[3]: function (optional) $w = $v[$k]; $w0length = strlen($w[0]); for ($i2 = $common; $i2 < $w0length; $i2++) { if ($c + $common === $l) { $diff = -1; break; } $diff = strcmp($this->current[$c+$common], $w[0][$i2]); if ($diff !== 0) { break; } $common++; } if ($diff < 0) { $j = $k; $common_j = $common; } else { $i = $k; $common_i = $common; } if ($j - $i <= 1) { if ($i > 0) { break; } // v->s has been inspected if ($j === $i) { break; } // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. 
if ($first_key_inspected) { break; } $first_key_inspected = true; } } do { $w = $v[$i]; $w0length = strlen($w[0]); if ($common_i >= $w0length) { $this->cursor = $c + $w0length; if (count($w) < 4) { return $w[2]; } $res = $this->{$w[3]}(); $this->cursor = $c + $w0length; if ($res) { return $w[2]; } } $i = $w[1]; } while ($i >= 0); return 0; } /** * find_among_b is for backwards processing. Same comments apply */ protected function find_among_b(array $v): int { $i = 0; $j = count($v); $c = $this->cursor; $lb = $this->limit_backward; $common_i = 0; $common_j = 0; $first_key_inspected = false; while (true) { $k = $i + (($j-$i) >> 1); $diff = 0; $common = min($common_i, $common_j); $w = $v[$k]; $w0length = strlen($w[0]); for ($i2 = $w0length - 1 - $common; $i2 >= 0; $i2--) { if ($c - $common == $lb) { $diff = -1; break; } $diff = strcmp($this->current[$c - 1 - $common], $w[0][$i2]); if ($diff != 0) { break; } $common++; } if ($diff < 0) { $j = $k; $common_j = $common; } else { $i = $k; $common_i = $common; } if ($j - $i <= 1) { if ($i > 0 || $j === $i || $first_key_inspected) { break; } $first_key_inspected = true; } } do { $w = $v[$i]; $w0length = strlen($w[0]); if ($common_i >= $w0length) { $this->cursor = $c - $w0length; if (count($w) < 4) { return $w[2]; } $res = $this->{$w[3]}(); $this->cursor = $c - $w0length; if ($res) { return $w[2]; } } $i = $w[1]; } while ($i >= 0); return 0; } /** * to replace chars between $c_bra and $c_ket in $this->current by the chars in $s. 
/**
 * Return the byte sequence that starts at the cursor.
 *
 * The number of bytes returned is chosen from the value of the byte at
 * the cursor: 1 below 0xc0, 2 below 0xe0, 3 below 0xf0, otherwise 4.
 * substr() clamps the result to the end of the string.
 */
private function charAt(): string
{
    $lead = ord($this->current[$this->cursor]);

    if ($lead < 0xc0) {
        $width = 1;
    } elseif ($lead < 0xe0) {
        $width = 2;
    } elseif ($lead < 0xf0) {
        $width = 3;
    } else {
        $width = 4;
    }

    return substr($this->current, $this->cursor, $width);
}
/**
 * Move the cursor forwards by $delta characters.
 *
 * A character is one byte followed by any run of bytes of the form
 * 10xxxxxx. If the limit is reached before $delta characters have been
 * skipped, false is returned and the cursor is not modified.
 */
protected function hop(int $delta): bool
{
    $pos = $this->cursor;

    for ($remaining = $delta; $remaining > 0; $remaining--) {
        if ($pos >= $this->limit) {
            return false;
        }
        // Step over the first byte, then over trailing 10xxxxxx bytes.
        $pos++;
        while ($pos < $this->limit && (ord($this->current[$pos]) & 0xc0) === 0x80) {
            $pos++;
        }
    }

    $this->cursor = $pos;
    return true;
}
/bin/sh/env python import sys import re import os python_out_folder = sys.argv[1] filematch = re.compile(r"(\w+)_stemmer\.py$") imports = [] languages = [] for pyscript in os.listdir(python_out_folder): match = filematch.match(pyscript) if (match): langname = match.group(1) titlecase = re.sub(r"_", "", langname.title()) languages.append(" '%(lang)s': %(title)sStemmer," % {'lang': langname, 'title': titlecase}) imports.append(' from .%(lang)s_stemmer import %(title)sStemmer' % {'lang': langname, 'title': titlecase}) imports.sort() languages.sort() if len(languages) == 0: raise AssertionError('languages list is empty!') src = '''__all__ = ('language', 'stemmer') try: import Stemmer algorithms = Stemmer.algorithms stemmer = Stemmer.Stemmer except ImportError: %(imports)s _languages = { %(languages)s } def algorithms(): return list(_languages.keys()) def stemmer(lang): lang = lang.lower() if lang in _languages: return _languages[lang]() else: raise KeyError("Stemming algorithm '%%s' not found" %% lang) ''' % {'imports': '\n'.join(imports), 'languages': '\n'.join(languages)} with open(os.path.join(python_out_folder, '__init__.py'), 'w') as out: out.write(src) snowball-3.1.0/python/pyproject.toml000066400000000000000000000001211520373054300175650ustar00rootroot00000000000000[build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" snowball-3.1.0/python/setup.cfg000066400000000000000000000001321520373054300164740ustar00rootroot00000000000000[metadata] long_description = file: README.rst long_description_content_type = text/x-rst snowball-3.1.0/python/setup.py000066400000000000000000000052371520373054300164000ustar00rootroot00000000000000#!/usr/bin/env python from setuptools import setup import re SNOWBALL_VERSION = '3.1.0' n_stemmers = 0 langs = [] variants = {} with open('modules.txt') as fp: for line in fp.readlines(): if len(line) <= 1 or line[0] == '#': continue if line[-1:] == '\n': line = line[:-1] tokens = re.split(r'\s+', line) if 
len(tokens) < 3: print("Bad modules.txt line: " + line) continue (name, encs, codes) = tokens[:3] if len(tokens) > 3: variant_of = tokens[3] if variant_of in variants: variants[variant_of].append(name) else: variants[variant_of] = [name] else: langs.append(name) n_stemmers += 1 desc = 'This package provides ' + str(n_stemmers) + ' stemmers for ' + \ str(len(langs)) + ' languages generated from Snowball algorithms.' classifiers = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', ] for lang in langs: lang_titlecase = lang.title() # Only classifiers listed in https://pypi.org/classifiers/ are allowed # Remove them here or submit them to https://github.com/pypa/trove-classifiers classifiers.append('Natural Language :: ' + lang_titlecase) classifiers.extend([ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Database', 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Linguistic' ]) setup(name='snowballstemmer', version=SNOWBALL_VERSION, description=desc, author='Snowball Developers', author_email='snowball-discuss@lists.tartarus.org', url='https://github.com/snowballstem/snowball', keywords="stemmer", license="BSD-3-Clause", packages=['snowballstemmer'], package_dir={"snowballstemmer": "src/snowballstemmer"}, 
class Among:
    # Plain record describing one entry of an "among" lookup table; the
    # constructor stores its four arguments unchanged.
    def __init__(self, s, substring_i, result, method=None):
        """
        @ivar s search string
        @ivar substring index to longest matching substring
        @ivar result of the lookup
        @ivar method method to use if substring matches
        """
        (self.s,
         self.substring_i,
         self.result,
         self.method) = (s, substring_i, result, method)
def out_grouping(self, s):
    '''
    Step over one character that is NOT in the grouping s.

    Returns True and advances the cursor by one when the cursor is before
    the limit and the character at the cursor is not in s; otherwise
    returns False and leaves the cursor unchanged.
    '''
    pos = self.cursor
    if pos < self.limit and self.current[pos] not in s:
        self.cursor = pos + 1
        return True
    return False
def find_among_b(self, v):
    '''
    Backwards counterpart of find_among.

    Binary-searches the table v (entries expose .s, .substring_i, .result
    and .method) against the characters that precede the cursor, comparing
    from the last character of each entry towards its first.  The chain of
    substring_i links is then followed from the located entry; the first
    entry that is fully matched and whose method is None or returns a true
    value has its result returned, with the cursor moved back past it.
    Returns 0 when the chain ends (substring_i < 0) without a match.
    '''
    lo = 0
    hi = len(v)
    start = self.cursor
    floor = self.limit_backward

    matched_lo = 0
    matched_hi = 0
    probed_first = False

    while True:
        mid = lo + ((hi - lo) >> 1)
        entry = v[mid]
        # Characters already known to match for both bounds need not be
        # compared again.
        matched = matched_lo if matched_lo < matched_hi else matched_hi
        order = 0
        for pos in range(len(entry.s) - 1 - matched, -1, -1):
            if start - matched == floor:
                # Ran out of input: treat the input as sorting first.
                order = -1
                break
            order = ord(self.current[start - 1 - matched]) - ord(entry.s[pos])
            if order != 0:
                break
            matched += 1

        if order < 0:
            hi, matched_hi = mid, matched
        else:
            lo, matched_lo = mid, matched

        if hi - lo <= 1:
            # Stop unless entry 0 still has to be inspected once.
            if lo > 0 or hi == lo or probed_first:
                break
            probed_first = True

    idx = lo
    while idx >= 0:
        entry = v[idx]
        size = len(entry.s)
        if matched_lo >= size:
            self.cursor = start - size
            if entry.method is None:
                return entry.result
            if entry.method(self):
                # The method may have moved the cursor; put it back.
                self.cursor = start - size
                return entry.result
        idx = entry.substring_i
    return 0
def insert(self, c_bra, c_ket, s):
    '''
    Replace the text between c_bra and c_ket with s (via replace_s) and
    shift self.bra and self.ket by the resulting length adjustment when
    they lie at or after c_bra.

    @type c_bra int
    @type c_ket int
    @type s: string
    '''
    shift = self.replace_s(c_bra, c_ket, s)
    old_bra, old_ket = self.bra, self.ket
    self.bra = old_bra + shift if c_bra <= old_bra else old_bra
    self.ket = old_ket + shift if c_bra <= old_ket else old_ket
If -p2 is given the output file is a two column layout containing the input words in the first column and the stemmed equivalents in the second column. Otherwise, the output file consists of the stemmed words, one per line. -h displays this help''' % sys.argv[0]) def main(): pretty = 0 input = '' output = '' encoding = 'utf_8' language = 'English' show_help = False argv = sys.argv[1:] while len(argv): arg = argv.pop(0) if arg == '-h': show_help = True break elif arg == "-p": pretty = 1 elif arg == "-p2": pretty = 2 elif arg == "-l": if len(argv) == 0: show_help = True break language = argv.pop(0) elif arg == "-i": if len(argv) == 0: show_help = True break input = argv.pop(0) elif arg == "-o": if len(argv) == 0: show_help = True break output = argv.pop(0) elif arg == "-c": if len(argv) == 0: show_help = True break encoding = argv.pop(0) if show_help: usage() else: stemmer = snowballstemmer.stemmer(language) if input != '': infile = open(input, "r", encoding=encoding) else: infile = sys.stdin # reconfigure() requires Python 3.7 so check existing encoding. 
def stemming(stemmer, infile, outfile, pretty):
    """
    Stem every line read from infile and write the results to outfile.

    stemmer -- object providing stemWord(str) -> str
    infile  -- text stream providing readlines()
    outfile -- text stream providing write(str)
    pretty  -- output layout:
               0: the stem alone (nothing but the newline if it is empty)
               1: "<word> -> <stem>"
               2: two columns, the stem starting at column 30; a word of
                  30 or more characters puts the stem on the next line

    Each input line is stripped of surrounding whitespace and has its
    ASCII letters A-Z lowercased before stemming.  Every input line
    produces exactly one terminating newline in the output.
    """
    for original in infile.readlines():
        original = original.strip()
        # Convert only ASCII-letters to lowercase, to match C behavior
        original = ''.join(c.lower() if 'A' <= c <= 'Z' else c for c in original)
        stemmed = stemmer.stemWord(original)
        if pretty == 0:
            if stemmed != "":
                outfile.write(stemmed)
        elif pretty == 1:
            # write() takes exactly one string; passing the three pieces
            # as separate arguments raised TypeError.
            outfile.write(original + " -> " + stemmed)
        elif pretty == 2:
            outfile.write(original)
            if len(original) < 30:
                outfile.write(" " * (30 - len(original)))
            else:
                outfile.write("\n")
                outfile.write(" " * 30)
            outfile.write(stemmed)
        outfile.write('\n')
return NULL; *z = default_SN_env; z->p = create_s(); if (z->p == NULL) { SN_delete_env(z); return NULL; } return z; } extern void SN_delete_env(struct SN_env * z) { if (z == NULL) return; if (z->p) lose_s(z->p); free(z); } extern int SN_set_current(struct SN_env * z, int size, const symbol * s) { int err = replace_s(z, 0, z->l, size, s); z->c = 0; return err; } snowball-3.1.0/runtime/api.h000066400000000000000000000015041520373054300157430ustar00rootroot00000000000000#ifndef SNOWBALL_API_H_INCLUDED #define SNOWBALL_API_H_INCLUDED typedef unsigned char symbol; /* Or replace 'char' above with 'short' for 16 bit characters. More precisely, replace 'char' with whatever type guarantees the character width you need. Note however that sizeof(symbol) should divide HEAD, defined in snowball_runtime.h as 2*sizeof(int), without remainder, otherwise there is an alignment problem. In the unlikely event of a problem here, consult Martin Porter. */ struct SN_env { symbol * p; int c; int l; int lb; int bra; int ket; int af; }; #ifdef __cplusplus extern "C" { #endif extern struct SN_env * SN_new_env(int alloc_size); extern void SN_delete_env(struct SN_env * z); extern int SN_set_current(struct SN_env * z, int size, const symbol * s); #ifdef __cplusplus } #endif #endif snowball-3.1.0/runtime/snowball_runtime.h000066400000000000000000000073501520373054300205630ustar00rootroot00000000000000#ifndef SNOWBALL_INCLUDED_SNOWBALL_RUNTIME_H #define SNOWBALL_INCLUDED_SNOWBALL_RUNTIME_H #include "api.h" #define HEAD 2*sizeof(int) #ifdef __cplusplus /* Use reinterpret_cast<> to avoid -Wcast-align warnings from clang++. 
*/ # define SIZE(p) (reinterpret_cast(p))[-1] # define SET_SIZE(p, n) (reinterpret_cast(p))[-1] = n # define CAPACITY(p) (reinterpret_cast(p))[-2] #else # define SIZE(p) ((const int *)(p))[-1] # define SET_SIZE(p, n) ((int *)(p))[-1] = n # define CAPACITY(p) ((int *)(p))[-2] #endif #ifdef SNOWBALL_RUNTIME_THROW_EXCEPTIONS # define SNOWBALL_ERR void #else # define SNOWBALL_ERR int #endif #ifdef SNOWBALL_DEBUG_COMMAND_USED # include static void debug(struct SN_env * z, int n, int line) { int i; int len = SIZE(z->p); printf("%3d (line %4d): [%d]'", n, line, len); for (i = 0; i <= len; i++) { if (z->lb == i) printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < len) { int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } printf("'\n"); } #endif struct among { /* Number of symbols in s. */ int s_size; /* Search string. */ const symbol * s; /* Delta of index to longest matching substring, or 0 if none. */ int substring_i; /* Result of the lookup. */ int result; /* Optional condition routine index, or 0 if none. 
*/ int function; }; #ifdef __cplusplus extern "C" { #endif extern symbol * create_s(void); extern void lose_s(symbol * p); extern int skip_utf8(const symbol * p, int c, int limit, int n); extern int skip_b_utf8(const symbol * p, int c, int limit, int n); extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int eq_s(struct SN_env * z, int s_size, const symbol * s); extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s); extern int eq_v(struct SN_env * z, const symbol * p); extern int eq_v_b(struct SN_env * z, const symbol * p); extern int find_among(struct SN_env * z, const struct among * v, int v_size, int (*)(struct SN_env *)); extern int find_among_b(struct SN_env * z, const struct among * v, int v_size, int (*)(struct SN_env *)); extern SNOWBALL_ERR replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s); extern SNOWBALL_ERR slice_from_s(struct SN_env * z, int s_size, const symbol * s); extern SNOWBALL_ERR slice_from_v(struct SN_env * z, const symbol * p); extern SNOWBALL_ERR slice_del(struct SN_env * z); extern SNOWBALL_ERR insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); extern SNOWBALL_ERR insert_v(struct SN_env * z, int bra, int ket, const symbol * 
/* Skip n characters forwards from position c in p, never moving past limit.
 *
 * Returns the new position, or -1 if limit is reached before n characters
 * have been skipped.  A character is one symbol, plus -- when that symbol
 * is >= 0xC0 -- every following symbol in the range 0x80..0xBF.
 */
extern int skip_utf8(const symbol * p, int c, int limit, int n) {
    for (; n > 0; --n) {
        if (c >= limit) return -1;
        if (p[c++] < 0xC0) continue;    /* not a multi-byte lead: done */
        /* Consume the trailing 10------ bytes of this character. */
        while (c < limit && p[c] >= 0x80 && p[c] < 0xC0) c++;
    }
    return c;
}
/* Skip n characters backwards from position c in p, never moving before
 * limit.
 *
 * Returns the new position, or -1 if limit is reached before n characters
 * have been skipped.  After stepping back one symbol, a symbol >= 0x80
 * causes further steps back until a symbol >= 0xC0 is found or limit is
 * reached.
 */
extern int skip_b_utf8(const symbol * p, int c, int limit, int n) {
    for (; n > 0; --n) {
        if (c <= limit) return -1;
        --c;
        if (p[c] < 0x80) continue;      /* single-byte character */
        /* Walk back to the lead byte (>= 1100 0000). */
        while (c > limit && p[c] < 0xC0) --c;
    }
    return c;
}
*/ for (j = 0; j != max - min; ++j) { if (s[j >> 3] & (0X1 << (j & 0X7))) { ++outof; if (j < ch) ++i; } } if (grouping_number < (int)sizeof(grouping_seen) && grouping_seen[grouping_number] == 0) { /* Report every entry once, then unused cases will appear (and we can * decrement each count when generating the coverage report). */ int k = 0; for (j = 0; j != max - min; ++j) { if (s[j >> 3] & (0X1 << (j & 0X7))) { fprintf(stderr, "%s index %d of %d '", loc, k, outof + 1); int codepoint = j + min; if (codepoint < 0x80) { putc(codepoint, stderr); } else if (codepoint < 0x800) { putc((codepoint >> 6) | 0xC0, stderr); putc((codepoint & 0x3F) | 0x80, stderr); } else { putc((codepoint >> 12) | 0xE0, stderr); putc(((codepoint >> 6) & 0x3F) | 0x80, stderr); putc((codepoint & 0x3F) | 0x80, stderr); } fprintf(stderr, "'\n"); ++k; } } grouping_seen[grouping_number] = 1; } fprintf(stderr, "%s index %d of %d '%.*s'\n", loc, i, outof + 1, w, p); } static void report_coverage_nomatch(const unsigned char * s, int min, int max) { const unsigned char * loc = s + (max - min + 8) / 8; ++loc; fprintf(stderr, "%s no match\n", loc); } #endif extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; int w = get_utf8(z->p, z->c, z->l, & ch); if (!w) return -1; if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { #ifdef SNOWBALL_COVERAGE report_coverage_nomatch(s, min, max); #endif return w; } #ifdef SNOWBALL_COVERAGE report_coverage(s, min, max, ch, z->p + z->c, w); #endif z->c += w; } while (repeat); return 0; } extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; int w = get_b_utf8(z->p, z->c, z->lb, & ch); if (!w) return -1; if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { #ifdef SNOWBALL_COVERAGE report_coverage_nomatch(s, min, max); #endif return w; } #ifdef SNOWBALL_COVERAGE report_coverage(s, min, max, ch, z->p + 
z->c - w, w); #endif z->c -= w; } while (repeat); return 0; } extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; int w = get_utf8(z->p, z->c, z->l, & ch); if (!w) return -1; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) { #ifdef SNOWBALL_COVERAGE report_coverage(s, min, max, ch, z->p + z->c, w); #endif return w; } #ifdef SNOWBALL_COVERAGE report_coverage_nomatch(s, min, max); #endif z->c += w; } while (repeat); return 0; } extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; int w = get_b_utf8(z->p, z->c, z->lb, & ch); if (!w) return -1; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) { #ifdef SNOWBALL_COVERAGE report_coverage(s, min, max, ch, z->p + z->c - w, w); #endif return w; } #ifdef SNOWBALL_COVERAGE report_coverage_nomatch(s, min, max); #endif z->c -= w; } while (repeat); return 0; } /* Code for character groupings: non-utf8 cases */ extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; if (z->c >= z->l) return -1; ch = z->p[z->c]; if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 1; z->c++; } while (repeat); return 0; } extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; if (z->c <= z->lb) return -1; ch = z->p[z->c - 1]; if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 1; z->c--; } while (repeat); return 0; } extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { int ch; if (z->c >= z->l) return -1; ch = z->p[z->c]; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) return 1; z->c++; } while (repeat); return 0; } extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int 
repeat) { do { int ch; if (z->c <= z->lb) return -1; ch = z->p[z->c - 1]; if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) return 1; z->c--; } while (repeat); return 0; } extern int eq_s(struct SN_env * z, int s_size, const symbol * s) { if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0; z->c += s_size; return 1; } extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) { if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0; z->c -= s_size; return 1; } extern int eq_v(struct SN_env * z, const symbol * p) { return eq_s(z, SIZE(p), p); } extern int eq_v_b(struct SN_env * z, const symbol * p) { return eq_s_b(z, SIZE(p), p); } #ifdef SNOWBALL_COVERAGE /* Declare more entries than any real Snowball program will have. */ static char among_seen[4096]; #endif extern int find_among(struct SN_env * z, const struct among * v, int v_size, int (*call_among_func)(struct SN_env*)) { int i = 0; int j = v_size; int c = z->c; int l = z->l; const symbol * q = z->p + c; const struct among * w; int common_i = 0; int common_j = 0; int first_key_inspected = 0; #ifdef SNOWBALL_COVERAGE int among_number = v[v_size].s_size; if (among_number < (int)sizeof(among_seen) && among_seen[among_number] == 0) { /* Report every entry once, then unused cases will appear (and we can * decrement each count when generating the coverage report). */ int k; for (k = 0; k < v_size; ++k) { w = v + k; fprintf(stderr, "%s: among %d : %d of %d string '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s); } /* If the among matches the empty string without a gating function then * the "no match" case is impossible and so not useful to include in a * coverage report. 
*/ if (v[v_size * 2].s_size != -1) { fprintf(stderr, "%s: among %d no match\n", v[v_size * 2].s, among_number); } among_seen[among_number] = 1; } #endif while (1) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? common_i : common_j; /* smaller */ w = v + k; { int i2; for (i2 = common; i2 < w->s_size; i2++) { if (c + common == l) { diff = -1; break; } diff = q[common] - w->s[i2]; if (diff != 0) break; common++; } } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; /* v->s has been inspected */ if (j == i) break; /* only one item in v */ /* - but now we need to go round once more to get v->s inspected. This looks messy, but is actually the optimal approach. */ if (first_key_inspected) break; first_key_inspected = 1; } } w = v + i; while (1) { if (common_i >= w->s_size) { z->c = c + w->s_size; #ifdef SNOWBALL_COVERAGE fprintf(stderr, "%s: among %d : %d of %d string '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s); #endif if (!w->function) return w->result; z->af = w->function; if (call_among_func(z)) { z->c = c + w->s_size; #ifdef SNOWBALL_COVERAGE fprintf(stderr, "%s: among %d : %d of %d func-t '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s); #endif return w->result; } #ifdef SNOWBALL_COVERAGE fprintf(stderr, "%s: among %d : %d of %d func-f '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s); #endif } if (!w->substring_i) { #ifdef SNOWBALL_COVERAGE fprintf(stderr, "%s: among %d no match\n", v[v_size * 2].s, among_number); #endif return 0; } w += w->substring_i; } } /* find_among_b is for backwards processing. 
Same comments apply */

/* Search the among table v for an entry whose string matches the text that
 * ends at the cursor, comparing backwards from z->c towards z->lb.
 *
 *   z               - stemmer environment.  On a successful match z->c is set
 *                     to c - s_size (the start of the matched string); z->af
 *                     is set to the entry's function before the callback runs.
 *   v               - array of among entries to search.
 *   v_size          - number of entries in v.
 *   call_among_func - callback used to run an entry's gating function.
 *
 * Returns the result field of the matching entry, or 0 if nothing matches.
 *
 * NOTE(review): the binary search presumably relies on v being ordered
 * consistently with this right-to-left comparison - confirm against the code
 * generator.
 */
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size,
                        int (*call_among_func)(struct SN_env*)) {

    /* [i, j) is the range of entries still under consideration. */
    int i = 0;
    int j = v_size;

    int c = z->c;
    int lb = z->lb;

    /* q addresses the symbol immediately before the cursor. */
    const symbol * q = z->p + c - 1;

    const struct among * w;

    /* Number of symbols already known to match for entry i and entry j. */
    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

#ifdef SNOWBALL_COVERAGE
    /* NOTE(review): coverage builds read extra entries stored after the
     * v_size real ones (v[v_size].s_size, w[v_size].s, v[v_size * 2].s, ...);
     * their layout is not visible here - confirm against the generator. */
    int among_number = v[v_size].s_size;
    if (among_number < (int)sizeof(among_seen) && among_seen[among_number] == 0) {
        /* Report every entry once, then unused cases will appear (and we can
         * decrement each count when generating the coverage report). */
        int k;
        for (k = 0; k < v_size; ++k) {
            w = v + k;
            fprintf(stderr, "%s: among %d : %d of %d string '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s);
        }
        /* If the among matches the empty string without a gating function then
         * the "no match" case is impossible and so not useful to include in a
         * coverage report. */
        if (v[v_size * 2].s_size != -1) {
            fprintf(stderr, "%s: among %d no match\n", v[v_size * 2].s, among_number);
        }
        among_seen[among_number] = 1;
    }
#endif

    /* Binary search: narrow [i, j) until at most one candidate remains. */
    while (1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        /* Symbols matched by both bounds need not be compared again. */
        int common = common_i < common_j ? common_i : common_j;
        w = v + k;
        {
            /* Compare entry k from its last symbol towards its first against
             * the text read backwards from the cursor. */
            int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
                /* Reached the backward limit: treat the text as smaller. */
                if (c - common == lb) { diff = -1; break; }
                diff = q[- common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) {
            j = k;
            common_j = common;
        } else {
            i = k;
            common_i = common;
        }
        if (j - i <= 1) {
            if (i > 0) break;   /* entry 0 is not the candidate, so we are done */
            if (j == i) break;  /* the table has only one entry */

            /* Otherwise go round once more so that entry 0 itself is
             * compared; the flag stops us doing that more than once. */
            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }

    /* Walk from entry i along the substring_i links until one matches. */
    w = v + i;
    while (1) {
        /* The whole of this entry's string matched. */
        if (common_i >= w->s_size) {
            z->c = c - w->s_size;
#ifdef SNOWBALL_COVERAGE
            fprintf(stderr, "%s: among %d : %d of %d string '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s);
#endif
            /* No gating function: the string match alone decides. */
            if (!w->function) return w->result;
            z->af = w->function;
            if (call_among_func(z)) {
#ifdef SNOWBALL_COVERAGE
                fprintf(stderr, "%s: among %d : %d of %d func-t '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s);
#endif
                /* Set the cursor again - presumably because the gating
                 * function may have moved it. */
                z->c = c - w->s_size;
                return w->result;
            }
#ifdef SNOWBALL_COVERAGE
            fprintf(stderr, "%s: among %d : %d of %d func-f '%.*s'\n", w[v_size].s, among_number, w[v_size].result, v_size, w->s_size, w->s);
#endif
        }
        /* A zero link means there is no further candidate to try. */
        if (!w->substring_i) {
#ifdef SNOWBALL_COVERAGE
            fprintf(stderr, "%s: among %d no match\n", v[v_size * 2].s, among_number);
#endif
            return 0;
        }
        /* substring_i is a relative offset to the next candidate entry. */
        w += w->substring_i;
    }
}

/* Increase the size of the buffer pointed to by p to at least n symbols.
 * On success, returns 0.  If insufficient memory, returns -1.
 *
 * The allocation starts HEAD bytes before *p, so that address is what gets
 * passed to realloc().  The new capacity is n + 20 symbols and one further
 * symbol is allocated beyond that.  On failure *p is not modified.
 */
static int increase_size(symbol ** p, int n) {
    int new_size = n + 20;
    void * mem = realloc((char *) *p - HEAD,
                         HEAD + (new_size + 1) * sizeof(symbol));
    symbol * q;
    if (mem == NULL) return -1;
    /* Step over the header to get back to the first symbol. */
    q = (symbol *) (HEAD + (char *)mem);
    CAPACITY(q) = new_size;
    *p = q;
    return 0;
}

/* to replace symbols between c_bra and c_ket in z->p by the s_size symbols at s. Returns 0 on success, -1 on error.
*/ extern SNOWBALL_ERR replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s) { int adjustment = s_size - (c_ket - c_bra); if (adjustment != 0) { int len = SIZE(z->p); if (adjustment + len > CAPACITY(z->p)) { SNOWBALL_PROPAGATE_ERR(increase_size(&z->p, adjustment + len)); } memmove(z->p + c_ket + adjustment, z->p + c_ket, (len - c_ket) * sizeof(symbol)); SET_SIZE(z->p, adjustment + len); z->l += adjustment; if (z->c >= c_ket) z->c += adjustment; else if (z->c > c_bra) z->c = c_bra; } if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); SNOWBALL_RETURN_OK; } # define REPLACE_S(Z, B, K, SIZE, S) \ SNOWBALL_PROPAGATE_ERR(replace_s(Z, B, K, SIZE, S)) static SNOWBALL_ERR slice_check(struct SN_env * z) { if (z->bra < 0 || z->bra > z->ket || z->ket > z->l || z->l > SIZE(z->p)) /* this line could be removed */ { #if 0 fprintf(stderr, "faulty slice operation:\n"); debug(z, -1, 0); #endif SNOWBALL_RETURN_OR_THROW(-1, std::logic_error("Snowball slice invalid")); } SNOWBALL_RETURN_OK; } # define SLICE_CHECK(Z) SNOWBALL_PROPAGATE_ERR(slice_check(Z)) extern SNOWBALL_ERR slice_from_s(struct SN_env * z, int s_size, const symbol * s) { SLICE_CHECK(z); REPLACE_S(z, z->bra, z->ket, s_size, s); z->ket = z->bra + s_size; SNOWBALL_RETURN_OK; } extern SNOWBALL_ERR slice_from_v(struct SN_env * z, const symbol * p) { return slice_from_s(z, SIZE(p), p); } extern SNOWBALL_ERR slice_del(struct SN_env * z) { SLICE_CHECK(z); { int slice_size = z->ket - z->bra; if (slice_size != 0) { int len = SIZE(z->p); memmove(z->p + z->bra, z->p + z->ket, (len - z->ket) * sizeof(symbol)); SET_SIZE(z->p, len - slice_size); z->l -= slice_size; if (z->c >= z->ket) z->c -= slice_size; else if (z->c > z->bra) z->c = z->bra; z->ket = z->bra; } } SNOWBALL_RETURN_OK; } extern SNOWBALL_ERR insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) { REPLACE_S(z, bra, ket, s_size, s); if (bra <= z->ket) { int adjustment = s_size - (ket - bra); z->ket += adjustment; 
if (bra <= z->bra) z->bra += adjustment; } SNOWBALL_RETURN_OK; } extern SNOWBALL_ERR insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { return insert_s(z, bra, ket, SIZE(p), p); } extern SNOWBALL_ERR slice_to(struct SN_env * z, symbol ** p) { SLICE_CHECK(z); { int len = z->ket - z->bra; if (CAPACITY(*p) < len) { SNOWBALL_PROPAGATE_ERR(increase_size(p, len)); } memmove(*p, z->p + z->bra, len * sizeof(symbol)); SET_SIZE(*p, len); } SNOWBALL_RETURN_OK; } extern SNOWBALL_ERR assign_to(struct SN_env * z, symbol ** p) { int len = z->l; if (CAPACITY(*p) < len) { SNOWBALL_PROPAGATE_ERR(increase_size(p, len)); } memmove(*p, z->p, len * sizeof(symbol)); SET_SIZE(*p, len); SNOWBALL_RETURN_OK; } extern int len_utf8(const symbol * p) { int size = SIZE(p); int len = 0; while (size--) { symbol b = *p++; if (b >= 0xC0 || b < 0x80) ++len; } return len; } snowball-3.1.0/rust/000077500000000000000000000000001520373054300143335ustar00rootroot00000000000000snowball-3.1.0/rust/Cargo.toml000066400000000000000000000002101520373054300162540ustar00rootroot00000000000000[package] name = "testapp" version = "0.1.0" authors = ["Jakob Demler "] build = "build.rs" [dependencies] snowball-3.1.0/rust/build.rs000066400000000000000000000037561520373054300160130ustar00rootroot00000000000000use std::env; use std::fs; use std::fs::{OpenOptions}; use std::io::Write; use std::path::Path; // This build script makes the code independent from the algorithms declared // in the makefile. 
// We check which stemmers were generated and then produce the corresponding // includes for src/algorithms/mod.rs and a closure for src/main.rs to match // strings to stemmers fn main() { let out_dir = env::var("OUT_DIR").unwrap(); let lang_match_path = Path::new(&out_dir).join("lang_matches.rs"); let lang_include_path = Path::new(&out_dir).join("lang_include.rs"); let mut lang_match_file = OpenOptions::new().write(true).create(true).truncate(true).open(&lang_match_path).unwrap(); let mut lang_include_file = OpenOptions::new().write(true).create(true).truncate(true).open(&lang_include_path).unwrap(); let src_dir = Path::new(&env::var("CARGO_MANIFEST_DIR").unwrap()).join("src"); let algo_dir = src_dir.join("snowball/algorithms"); lang_match_file.write_all(b" move |lang:String|{ match lang.as_str() {") .unwrap(); for file in fs::read_dir(&algo_dir).unwrap() { let file = file.unwrap(); let path = file.path(); let filestem = path.file_stem().unwrap().to_str().unwrap(); if path.is_file() && filestem != "mod" { //Also we need to copy all the stemmer files into OUT_DIR... fs::copy(&path, Path::new(&out_dir).join(file.file_name())).unwrap(); let split = filestem.len() - 8; let langname = &filestem[..split]; writeln!(&mut lang_match_file, "\"{}\" => Stemmer {{ stemmer: snowball::algorithms::{}_stemmer::stem}},", langname, langname) .unwrap(); writeln!(&mut lang_include_file, "pub mod {}_stemmer;", langname).unwrap(); } } lang_match_file.write_all(b" x => panic!(\"Unknown algorithm '{}'\", x) } } ") .unwrap(); } snowball-3.1.0/rust/rust-pre-1.27-compat.patch000066400000000000000000000024201520373054300210010ustar00rootroot00000000000000Applying this patch restores compatibility with Rust < 1.27 (but causes newer versions to report "warning: trait objects without an explicit `dyn` are deprecated"). 
diff --git a/rust/src/main.rs b/rust/src/main.rs index 064325a9..bf752795 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -56,9 +56,9 @@ fn main() { let mut output = if let Some(output_file) = output_arg { - Box::new(File::create(Path::new(&output_file)).unwrap()) as Box + Box::new(File::create(Path::new(&output_file)).unwrap()) as Box } else { - Box::new(std::io::stdout()) as Box + Box::new(std::io::stdout()) as Box }; if let Some(input_file) = input_arg { diff --git a/rust/src/snowball/among.rs b/rust/src/snowball/among.rs index 57fc8bae..70631933 100644 --- a/rust/src/snowball/among.rs +++ b/rust/src/snowball/among.rs @@ -3,4 +3,4 @@ use snowball::SnowballEnv; pub struct Among(pub &'static str, pub i32, pub i32, - pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); + pub Option<&'static (Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); snowball-3.1.0/rust/src/000077500000000000000000000000001520373054300151225ustar00rootroot00000000000000snowball-3.1.0/rust/src/main.rs000066400000000000000000000063041520373054300164170ustar00rootroot00000000000000use std::fs::File; use std::io::{BufRead, BufReader, Write}; use std::path::Path; use std::env; use std::borrow::Cow; pub mod snowball; use snowball::SnowballEnv; fn usage(name: &str) { println!("{} -l [-i ] [-o ] The input file consists of a list of words to be stemmed, one per line. Words should be in lower case, but (for English) A-Z letters are mapped to their a-z equivalents anyway. 
If omitted, stdin is used.", name); } fn main() { let args: Vec = env::args().collect(); if args.len() < 3 { usage(&args[0]); } else { let mut language = None; let mut input_arg = None; let mut output_arg = None; let mut i = 1; while i < args.len() { match args[i].as_str() { "-l" => { language = Some(args[i+1].clone()); i += 2; }, "-i" => { input_arg = Some(args[i+1].clone()); i += 2; }, "-o" => { output_arg = Some(args[i+1].clone()); i += 2; }, x => { println!("Unrecognized option '{}'", x); usage(&args[0]); return } } } if language.is_none() { println!("Please specify a language!"); usage(&args[0]); return; } let stemmer = Stemmer::create(language.unwrap()); let mut output = if let Some(output_file) = output_arg { Box::new(File::create(Path::new(&output_file)).unwrap()) as Box } else { Box::new(std::io::stdout()) as Box }; if let Some(input_file) = input_arg { for line in BufReader::new(File::open(Path::new(&input_file)).unwrap()).lines() { writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap(); } } else { let stdin = std::io::stdin(); for line in stdin.lock().lines() { writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap(); } } } } /// Wraps a usable interface around the actual stemmer implementation pub struct Stemmer { stemmer: fn(&mut SnowballEnv) -> bool, } impl Stemmer { /// Create a new stemmer from an algorithm pub fn create(lang: String) -> Self { // Have a look at ../build.rs // There we generate a file that is rust code for a closure that returns a stemmer. // We match against all the algorithms in src/snowball/algoritms/ folder. // Alas, this cannot be included as a match statement or function because of Rust's // hygenic macros. let match_language = include!(concat!(env!("OUT_DIR"), "/lang_matches.rs")); match_language(lang) } /// Stem a single word /// Please note, that the input is expected to be all lowercase (if that is applicable). 
pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> { let mut env = SnowballEnv::create(input); (self.stemmer)(&mut env); env.get_current() } } snowball-3.1.0/rust/src/snowball/000077500000000000000000000000001520373054300167435ustar00rootroot00000000000000snowball-3.1.0/rust/src/snowball/algorithms/000077500000000000000000000000001520373054300211145ustar00rootroot00000000000000snowball-3.1.0/rust/src/snowball/algorithms/mod.rs000066400000000000000000000001231520373054300222350ustar00rootroot00000000000000// Have a look at build.rs include!(concat!(env!("OUT_DIR"), "/lang_include.rs")); snowball-3.1.0/rust/src/snowball/among.rs000066400000000000000000000003751520373054300204170ustar00rootroot00000000000000use snowball::SnowballEnv; pub struct Among(pub &'static str, pub i32, pub i32, pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); snowball-3.1.0/rust/src/snowball/mod.rs000066400000000000000000000001771520373054300200750ustar00rootroot00000000000000pub mod algorithms; mod among; mod snowball_env; pub use snowball::among::Among; pub use snowball::snowball_env::SnowballEnv; snowball-3.1.0/rust/src/snowball/snowball_env.rs000066400000000000000000000364051520373054300220120ustar00rootroot00000000000000use std::borrow::Cow; use snowball::Among; #[derive(Debug, Clone)] pub struct SnowballEnv<'a> { pub current: Cow<'a, str>, pub cursor: i32, pub limit: i32, pub limit_backward: i32, pub bra: i32, pub ket: i32, } impl<'a> SnowballEnv<'a> { pub fn create(value: &'a str) -> Self { let len = value.len(); SnowballEnv { current: Cow::from(value), cursor: 0, limit: len as i32, limit_backward: 0, bra: 0, ket: len as i32, } } pub fn get_current(self) -> Cow<'a, str> { self.current } pub fn set_current(&mut self, current: &'a str) { self.current = Cow::from(current); } pub fn set_current_s(&mut self, current: String) { self.current = Cow::from(current); } fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 { let adjustment = s.len() as i32 - (ket - 
bra); let mut result = String::with_capacity(self.current.len()); { let (lhs, _) = self.current.split_at(bra as usize); let (_, rhs) = self.current.split_at(ket as usize); result.push_str(lhs); result.push_str(s); result.push_str(rhs); } // ... not very nice... let new_lim = self.limit + adjustment; self.limit = new_lim; if self.cursor >= ket { let new_cur = self.cursor + adjustment; self.cursor = new_cur; } else if self.cursor > bra { self.cursor = bra } self.current = Cow::from(result); adjustment } /// Check if s is after cursor. /// If so, move cursor to the end of s pub fn eq_s(&mut self, s: &str) -> bool { if self.cursor >= self.limit { return false; } if self.current[(self.cursor as usize)..].starts_with(s) { self.cursor += s.len() as i32; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor += 1; } true } else { false } } /// Check if 's' is before cursor /// If so, move cursor to the beginning of s pub fn eq_s_b(&mut self, s: &str) -> bool { if (self.cursor - self.limit_backward) < s.len() as i32 { false // Check if cursor -s.len is a char boundary. if not well... 
return false obv } else if !self.current.is_char_boundary(self.cursor as usize - s.len()) || !self.current[self.cursor as usize - s.len()..].starts_with(s) { false } else { self.cursor -= s.len() as i32; true } } /// Replace string between `bra` and `ket` with s pub fn slice_from(&mut self, s: &str) { let (bra, ket) = (self.bra, self.ket); self.replace_s(bra, ket, s); self.ket = bra + s.len() as i32; } /// Move cursor to next character pub fn next_char(&mut self) { self.cursor += 1; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor += 1; } } /// Move cursor to previous character pub fn previous_char(&mut self) { self.cursor -= 1; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor -= 1; } } pub fn hop(&mut self, mut delta: i32) -> bool { let mut res = self.cursor; while delta > 0 { delta -= 1; if res >= self.limit { return false; } res += 1; while res < self.limit && !self.current.is_char_boundary(res as usize) { res += 1; } } self.cursor = res; return true; } pub fn hop_checked(&mut self, delta: i32) -> bool { return delta >= 0 && self.hop(delta); } pub fn hop_back(&mut self, mut delta: i32) -> bool { let mut res = self.cursor; while delta > 0 { delta -= 1; if res <= self.limit_backward { return false; } res -= 1; while res > self.limit_backward && !self.current.is_char_boundary(res as usize) { res -= 1; } } self.cursor = res; return true; } pub fn hop_back_checked(&mut self, delta: i32) -> bool { return delta >= 0 && self.hop_back(delta); } // A grouping is represented by a minimum code point, a maximum code point, // and a bitfield of which code points in that range are in the grouping. // For example, in english.sbl, valid_LI is 'cdeghkmnrt'. // The minimum and maximum code points are 99 and 116, // so every time one of these grouping functions is called for g_valid_LI, // min must be 99 and max must be 116. 
There are 18 code points within that // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding: // // cdefghij klmnopqr st // 11101100 10110001 01000000 // // The first bit is the least significant. // Those three bytes become &[0b00110111, 0b10001101, 0b00000010], // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs. /// Check if the char the cursor points to is in the grouping pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor >= self.limit { return false; } if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { return false; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return false; } self.next_char(); return true; } return false; } pub fn go_in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor < self.limit { if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return true; } self.next_char(); } else { return false; } } return false; } pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor <= self.limit_backward { return false; } let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { self.cursor = c; return false; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.cursor = c; return false; } return true; } return false; } pub fn go_in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor > self.limit_backward { let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() 
{ let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { self.cursor = c; return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.cursor = c; return true; } } else { return false; } } return false; } pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor >= self.limit { return false; } if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { self.next_char(); return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.next_char(); return true; } } return false; } pub fn go_out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor < self.limit { if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch <= max && ch >= min { ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) != 0 { return true; } } self.next_char(); } else { return false; } } return false; } pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor <= self.limit_backward { return false; } let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return true; } self.cursor = c; } return false; } pub fn go_out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { while self.cursor > self.limit_backward { let c = self.cursor; self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch <= max && ch >= min { ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) != 0 { self.cursor = c; return true; } } } else { return false; } } return 
false; } /// Helper function that removes the string slice between `bra` and `ket` pub fn slice_del(&mut self) { self.slice_from("") } pub fn insert(&mut self, bra: i32, ket: i32, s: &str) { let adjustment = self.replace_s(bra, ket, s); if bra <= self.bra { self.bra = self.bra + adjustment; } if bra <= self.ket { self.ket = self.ket + adjustment; } } pub fn assign_to(&mut self) -> String { self.current[0..self.limit as usize].to_string() } pub fn slice_to(&mut self) -> String { self.current[self.bra as usize..self.ket as usize].to_string() } pub fn find_among(&mut self, amongs: &[Among], context: &mut T) -> i32 { use std::cmp::min; let mut i: i32 = 0; let mut j: i32 = amongs.len() as i32; let c = self.cursor; let l = self.limit; let mut common_i = 0i32; let mut common_j = 0i32; let mut first_key_inspected = false; loop { let k = i + ((j - i) >> 1); let mut diff: i32 = 0; let mut common = min(common_i, common_j); let w = &amongs[k as usize]; for lvar in common..w.0.len() as i32 { if c + common == l { diff = -1; break; } diff = self.current.as_bytes()[(c + common) as usize] as i32 - w.0.as_bytes()[lvar as usize] as i32; if diff != 0 { break; } common += 1; } if diff < 0 { j = k; common_j = common; } else { i = k; common_i = common; } if j - i <= 1 { if i > 0 { break; } if j == i { break; } if first_key_inspected { break; } first_key_inspected = true; } } loop { let w = &amongs[i as usize]; if common_i >= w.0.len() as i32{ self.cursor = c + w.0.len() as i32; if let Some(ref method) = w.3 { if method(self, context) { self.cursor = c + w.0.len() as i32; return w.2; } } else { return w.2; } } i = w.1; if i < 0 { return 0; } } } pub fn find_among_b(&mut self, amongs: &[Among], context: &mut T) -> i32 { let mut i: i32 = 0; let mut j: i32 = amongs.len() as i32; let c = self.cursor; let lb = self.limit_backward; let mut common_i = 0i32; let mut common_j = 0i32; let mut first_key_inspected = false; loop { let k = i + ((j - i) >> 1); let mut diff: i32 = 0; let mut common = if 
common_i < common_j { common_i } else { common_j }; let w = &amongs[k as usize]; for lvar in (0..w.0.len() - common as usize).rev() { if c - common == lb { diff = -1; break; } diff = self.current.as_bytes()[(c - common - 1) as usize] as i32 - w.0.as_bytes()[lvar] as i32; if diff != 0 { break; } // Count up commons. But not one character but the byte width of that char common += 1; } if diff < 0 { j = k; common_j = common; } else { i = k; common_i = common; } if j - i <= 1 { if i > 0 { break; } if j == i { break; } if first_key_inspected { break; } first_key_inspected = true; } } loop { let w = &amongs[i as usize]; if common_i >= w.0.len() as i32 { self.cursor = c - w.0.len() as i32; if let Some(ref method) = w.3 { if method(self, context) { self.cursor = c - w.0.len() as i32; return w.2; } } else { return w.2; } } i = w.1; if i < 0 { return 0; } } } } snowball-3.1.0/tests/000077500000000000000000000000001520373054300145005ustar00rootroot00000000000000snowball-3.1.0/tests/compilertest000077500000000000000000000023701520373054300171420ustar00rootroot00000000000000#!/bin/sh set -e SNOWBALL='../snowball -o tmp' r=0 # Tests for expected compiler errors. set x errors/*.sbl shift echo "1..$#" for t ; do b=`echo "$t"|sed 's/\.sbl$//'` f= # Run with -syntax to avoid generating files if we get none of the expected # errors. if $SNOWBALL -syntax "$t" 2> tmp.stderr > tmp.syntax ; then f="snowball compiler did not fail" else if [ -f "$b.stderr" ] ; then if ! diff "$b.stderr" tmp.stderr ; then f="stderr output not as expected" fi fi fi if [ -z "$f" ] ; then echo "ok - $b" else echo "not ok - $b: $f" r=1 fi done # Tests which check --syntax output. set x syntax/*.sbl shift echo "1..$#" for t ; do b=`echo "$t"|sed 's/\.sbl$//'` f= if $SNOWBALL -syntax "$t" 2> tmp.stderr > tmp.syntax ; then if [ -f "$b.stderr" ] ; then if ! diff "$b.stderr" tmp.stderr ; then f="stderr output not as expected" fi fi if [ -z "$f" ] && [ -f "$b.syntax" ] ; then if ! 
diff "$b.syntax" tmp.syntax ; then f="syntax tree not as expected" else rm tmp.syntax fi fi else f="snowball compiler failed" fi if [ -z "$f" ] ; then echo "ok - $b" rm -f tmp.stderr tmp.syntax else echo "not ok - $b: $f" r=1 fi done exit $r snowball-3.1.0/tests/errors/000077500000000000000000000000001520373054300160145ustar00rootroot00000000000000snowball-3.1.0/tests/errors/ae-errors.sbl000066400000000000000000000006441520373054300204210ustar00rootroot00000000000000externals (stem) integers (x) define stem as ( // One token per line to check which token the line number is taken from. $x = 1 / // Error here 0 $x /= // Error here 0 $ ( x = // Error here 0 ) $ stem // Error here = 0 $ unknown // Error here = 0 $ ( x => // Error here 0 ) $ ( x /= // Error here 0 ) $ ( x += // Error here 0 ) ) snowball-3.1.0/tests/errors/ae-errors.stderr000066400000000000000000000007751520373054300211510ustar00rootroot00000000000000errors/ae-errors.sbl:9: Division by zero errors/ae-errors.sbl:13: Division by zero errors/ae-errors.sbl:19: Expected relational operator, got '=' (did you mean '=='?) errors/ae-errors.sbl:24: 'stem' not of type integer or string errors/ae-errors.sbl:29: 'unknown' undeclared errors/ae-errors.sbl:36: Expected relational operator, got '=>' (did you mean '>='?) errors/ae-errors.sbl:43: Expected relational operator, got '/=' (did you mean '!='?) 
errors/ae-errors.sbl:50: Expected relational operator, got '+=' snowball-3.1.0/tests/errors/bad-dollar.sbl000066400000000000000000000002111520373054300205110ustar00rootroot00000000000000strings ( x ) externals ( stem ) booleans ( b ) groupings ( g ) define stem as ( $ = 'xxx' $stem = 1 $b = 1 $g = 1 ) snowball-3.1.0/tests/errors/bad-dollar.stderr000066400000000000000000000004011520373054300212350ustar00rootroot00000000000000errors/bad-dollar.sbl:10: unexpected = in integer test expression after $ errors/bad-dollar.sbl:11: 'stem' not of type integer or string errors/bad-dollar.sbl:12: 'b' not of type integer or string errors/bad-dollar.sbl:13: 'g' not of type integer or string snowball-3.1.0/tests/errors/bad-grouping-definition.sbl000066400000000000000000000002501520373054300232170ustar00rootroot00000000000000groupings (g1 g2 g3 g4 gempty gself) define g1 define g2 'abc' - define g3 externals (stem) define stem as () define gempty 'a' - 'a' define gself gself define g4 stem snowball-3.1.0/tests/errors/bad-grouping-definition.stderr000066400000000000000000000007141520373054300237470ustar00rootroot00000000000000errors/bad-grouping-definition.sbl:3: unexpected define in grouping definition after name errors/bad-grouping-definition.sbl:4: unexpected define in grouping definition after - errors/bad-grouping-definition.sbl:5: unexpected externals in grouping definition after name errors/bad-grouping-definition.sbl:7: empty grouping errors/bad-grouping-definition.sbl:8: gself defined in terms of itself errors/bad-grouping-definition.sbl:9: 'stem' not of type grouping snowball-3.1.0/tests/errors/missing-bra.sbl000066400000000000000000000001011520373054300207210ustar00rootroot00000000000000strings externals (stem) backwardmode define stem as ( among ) snowball-3.1.0/tests/errors/missing-bra.stderr000066400000000000000000000002311520373054300214500ustar00rootroot00000000000000errors/missing-bra.sbl:2: ( omitted after strings errors/missing-bra.sbl:4: ( omitted after 
backwardmode errors/missing-bra.sbl:6: ( omitted after among snowball-3.1.0/tests/errors/missing-command.sbl000066400000000000000000000004121520373054300216000ustar00rootroot00000000000000// Regression test for segfault introduced and fixed in git before 3.1.0. externals (stem) define stem as ( 'q' or ) // Also, we swallowed the token for the `)` and emitted an bogus extra error // here (this bug was present in releases): // // ) omitted after ) snowball-3.1.0/tests/errors/missing-command.stderr000066400000000000000000000000641520373054300223260ustar00rootroot00000000000000errors/missing-command.sbl:4: unexpected ) after or snowball-3.1.0/tests/errors/missing-ket-backwardmode.sbl000066400000000000000000000000761520373054300233740ustar00rootroot00000000000000externals (stem) routines (f) backwardmode ( define f as true snowball-3.1.0/tests/errors/missing-ket-backwardmode.stderr000066400000000000000000000000741520373054300241150ustar00rootroot00000000000000errors/missing-ket-backwardmode.sbl:4: ) omitted after true snowball-3.1.0/tests/errors/missing-ket.sbl000066400000000000000000000001541520373054300207500ustar00rootroot00000000000000externals (stem integers (i define stem as ( non-() $(cursor > 0 $i = (1 + 2 hop 5 [tolimit] delete snowball-3.1.0/tests/errors/missing-ket.stderr000066400000000000000000000004401520373054300214710ustar00rootroot00000000000000errors/missing-ket.sbl:2: ) omitted after name errors/missing-ket.sbl:3: ) omitted after name errors/missing-ket.sbl:4: name omitted after - errors/missing-ket.sbl:6: ) omitted after number errors/missing-ket.sbl:7: ) omitted after number errors/missing-ket.sbl:8: ) omitted after delete snowball-3.1.0/tests/errors/notdefined.sbl000066400000000000000000000001011520373054300206250ustar00rootroot00000000000000externals (stem) groupings (g) routines (r) define stem as (g r) 
snowball-3.1.0/tests/errors/notdefined.stderr000066400000000000000000000001751520373054300213630ustar00rootroot00000000000000errors/notdefined.sbl:4: routine 'r' declared but not defined errors/notdefined.sbl:4: grouping 'g' declared but not defined snowball-3.1.0/tests/errors/string-omitted.sbl000066400000000000000000000000611520373054300214640ustar00rootroot00000000000000externals (stem) define stem as ( <- hop 2 ) snowball-3.1.0/tests/errors/undeclared.sbl000066400000000000000000000005171520373054300206270ustar00rootroot00000000000000// Regression test for bug reporting names. externals (stem) booleans (rb longbool rb) groupings (rg longgroup rg) integers (ri longint ri) strings (rs longstring rs) define rg 'rg' define longgroup 'aeiou' define rg 'xy' define stem as ( set longbool set xb $longint = 1 $xi = 1 ri $longstring = '' $(sizeof xs > 0) ) snowball-3.1.0/tests/errors/undeclared.stderr000066400000000000000000000006051520373054300213500ustar00rootroot00000000000000errors/undeclared.sbl:3: 'rb' re-declared errors/undeclared.sbl:4: 'rg' re-declared errors/undeclared.sbl:5: 'ri' re-declared errors/undeclared.sbl:6: 'rs' re-declared errors/undeclared.sbl:10: 'rg' redefined errors/undeclared.sbl:14: 'xb' undeclared errors/undeclared.sbl:16: 'xi' undeclared errors/undeclared.sbl:17: integer name 'ri' misplaced errors/undeclared.sbl:19: 'xs' undeclared snowball-3.1.0/tests/errors/unexpected-token.sbl000066400000000000000000000004301520373054300217750ustar00rootroot00000000000000externals (stem) integers (i) groupings (g) define g + define stem as ( $ next $(i 0) $i = hop 2 $(i = ) backwardmode (next) $(i = 1) $(i /= 1) $(i => 1) $(i += 1) $(i -= 1) $(i *= 1) ) next strings (s1 s2) define s1 'string' define s2 as 'string' define snowball-3.1.0/tests/errors/unexpected-token.stderr000066400000000000000000000024041520373054300225230ustar00rootroot00000000000000errors/unexpected-token.sbl:4: unexpected + in grouping definition after name errors/unexpected-token.sbl:7: 
unexpected next in integer test expression after $ errors/unexpected-token.sbl:8: unexpected number 0 in integer test expression after name errors/unexpected-token.sbl:9: unexpected hop in integer expression after = errors/unexpected-token.sbl:10: Expected relational operator, got '=' (did you mean '=='?) errors/unexpected-token.sbl:10: unexpected ) in integer expression after = errors/unexpected-token.sbl:11: unexpected backwardmode after ) errors/unexpected-token.sbl:12: Expected relational operator, got '=' (did you mean '=='?) errors/unexpected-token.sbl:13: Expected relational operator, got '/=' (did you mean '!='?) errors/unexpected-token.sbl:14: Expected relational operator, got '=>' (did you mean '>='?) errors/unexpected-token.sbl:15: Expected relational operator, got '+=' errors/unexpected-token.sbl:16: Expected relational operator, got '-=' errors/unexpected-token.sbl:17: Expected relational operator, got '*=' errors/unexpected-token.sbl:19: unexpected next after ) errors/unexpected-token.sbl:22: 's1' not of type grouping errors/unexpected-token.sbl:23: 's2' not of type routine or external errors/unexpected-token.sbl:25: name omitted after define snowball-3.1.0/tests/errors/wrongdirection.sbl000066400000000000000000000003161520373054300215530ustar00rootroot00000000000000externals ( stem ) routines ( remove_suffix ) backwardmode ( define remove_suffix as ( [substring] among ( 'al' 'ance' 'ence' 'er' 'ic' (delete) ) ) ) define stem as ( remove_suffix ) snowball-3.1.0/tests/errors/wrongdirection.stderr000066400000000000000000000001261520373054300222750ustar00rootroot00000000000000errors/wrongdirection.sbl:13: routine 'remove_suffix' mis-used in string forward mode snowball-3.1.0/tests/runtime/000077500000000000000000000000001520373054300161635ustar00rootroot00000000000000snowball-3.1.0/tests/runtime/among.sbl000066400000000000000000000003421520373054300177650ustar00rootroot00000000000000// Test among. 
externals (stem) routines (gate) stringescapes {} define gate as 'q' define stem as ( test ( among ( 'x' () '' gate () ) ='gated default among case did not fail as expected' ) ) snowball-3.1.0/tests/runtime/arithmeticexpr.sbl000066400000000000000000000045761520373054300217310ustar00rootroot00000000000000// Test arithmetic expressions (AEs). externals (stem) routines (operators pseudovars) integers (x y) define stem as ( operators pseudovars ) define operators as ( $x = 3 * 2 $x *= 5 $(x * 7 == 210) or fail='multiplication gave wrong answer' $x = 1 + 2 $x += 4 $(x + 8 == 15) or fail='addition gave wrong answer' $x = 1 - 2 $x -= 4 $(8 - x == 13) or fail='subtraction gave wrong answer' $(-x == 5) or fail='negation gave wrong answer' // Test division truncates towards zero. $x = 7 $x /= 4 $(x == 1) or fail='/= should truncate towards zero (+/+)' $y = 7 $x = y / 4 $(x == 1) or fail='/ should truncate towards zero (+/+)' $(7 / 4 * 4 == 4) or fail='should use integer / (+/+)' $x = -7 $x /= -4 $(x == 1) or fail='/= should truncate towards zero (-/-)' $y = -7 $x = y / -4 $(x == 1) or fail='/ should truncate towards zero (-/-)' $(-7 / -4 * 4 == 4) or fail='should use integer / (-/-)' $x = -7 $x /= 4 $(x == -1) or fail='/= should truncate towards zero (-/+)' $y = -7 $x = y / 4 $(x == -1) or fail='/ should truncate towards zero (-/+)' $(-7 / 4 * 4 == -4) or fail='should use integer / (-/+)' $x = 7 $x /= -4 $(x == -1) or fail='/= should truncate towards zero (+/-)' $y = 7 $x = y / -4 $(x == -1) or fail='/ should truncate towards zero (+/-)' $(7 / -4 * 4 == -4) or fail='should use integer / (+/-)' // Some target languages warn or even error for self-assignment. 
$x = x $(x == -1) or fail='self assignment changed value' $x = cursor tomark cursor $(x == cursor) or fail='tomark cursor moved cursor' ) define pseudovars as ( $(cursor == 0) or fail='cursor should be 0' $(limit == 2) or fail='limit should be 2' test (next $(cursor == 1)) or fail='cursor should be 1' test (hop 2 $(cursor == 2)) or fail='cursor should be 2' setlimit next for $(limit == 1) or fail='limit should be 1' test (reverse $(cursor == 0)) or fail='cursor should be 0 (reverse)' test (reverse $(limit == 0)) or fail='limit should be 0 (reverse)' test (next reverse $(limit == 0)) or fail='limit should be 0 (reverse)' test (next reverse $(cursor == 1)) or fail='cursor should be 1 (reverse)' test (backwards $(cursor == 2)) or fail='cursor should be 2 (backwards)' test (backwards $(limit == 0)) or fail='limit should be 0 (backwards)' test (next backwards $(limit == 1)) or fail='limit should be 1 (backwards)' ) snowball-3.1.0/tests/runtime/attachinsert.sbl000066400000000000000000000016501520373054300213600ustar00rootroot00000000000000// Test `attach` and `insert`. 
externals (stem) define stem as ( ='abcd' do ( hop 2 attach '-' $(cursor == 2) or fail='forwards attach changed cursor' next ) $(cursor == 0) or fail='do failed to restore cursor after forwards attach' ='abcd' do ( hop 2 insert '-' $(cursor == 3) or fail='forwards insert failed to move cursor' next ) $(cursor == 0) or fail='do failed to restore cursor after forwards insert' ='abcd' backwards ( do ( hop 2 attach '-' $(cursor == 3) or fail='backwards attach failed to move cursor' next ) $(cursor == size) or fail='do failed to restore cursor after backwards attach' ) ='abcd' backwards ( do ( hop 2 insert '-' $(cursor == 2) or fail='backwards insert moved cursor' next ) $(cursor == size) or fail='do failed to restore cursor after backwards insert' ) ='ok' ) snowball-3.1.0/tests/runtime/booleans.sbl000066400000000000000000000004511520373054300204670ustar00rootroot00000000000000// Test strings externals (stem) booleans (b) stringescapes {} define stem as ( true or fail='true failed' not false or fail='false did not fail' set b b or fail='set b did not set b' unset b not b or fail='unset b did not unset b' set b b or fail='set b did not set b (2)' ) snowball-3.1.0/tests/runtime/externals.sbl000066400000000000000000000006641520373054300207000ustar00rootroot00000000000000// Test calling externals within the program. externals (stem e1 e2) define stem as ( not e2 or fail='failed to call external e2' test e1 or fail='external as among function failed' $(len == 4) or fail='external as among function result wrong' not e1 or fail='external as among function did not fail' ='ok' ) define e1 as ( [substring] among ( 'ok' e2 ( <-'oops' ) 'no' ( <-'nope' ) ) ) define e2 as atlimit snowball-3.1.0/tests/runtime/hop.sbl000066400000000000000000000004711520373054300174550ustar00rootroot00000000000000// Test `hop` and `next`. 
externals (stem) strings (s) define stem as ( next or fail='next failed' not (hop len-3) or fail='hop by calculated negative signalled t' test (tolimit not next or fail='next at end of string signalled t') not (hop len-cursor+1) or fail='hop past end of string signalled t' ) snowball-3.1.0/tests/runtime/integertests.sbl000066400000000000000000000017301520373054300214060ustar00rootroot00000000000000// Test integer tests. externals (stem) integers (x y) define stem as ( $x = len $x == 2 or fail='== failed (old style)' $(x == 2) or fail='== failed' $(2 == x) or fail='== failed (reversed)' $x != 3 or fail='!= failed (old style)' $(x != 3) or fail='!= failed' $(3 != x) or fail='!= failed (reversed)' $x >= 2 or fail='>= failed (old style)' $(x >= 2) or fail='>= failed' $(x >= 1) or fail='>= failed (nonequal)' $(2 >= x) or fail='>= failed (reversed)' $(3 >= x) or fail='>= failed (nonequal, reversed)' $x <= 2 or fail='<= failed (old style)' $(x <= 2) or fail='<= failed' $(x <= 3) or fail='<= failed (nonequal)' $(2 <= x) or fail='<= failed (reversed)' $(1 <= x) or fail='<= failed (nonequal, reversed)' $x > 1 or fail='> failed (old style)' $(x > 1) or fail='> failed' $(3 > x) or fail='> failed (nonequal, reversed)' $x < 3 or fail='< failed (old style)' $(x < 3) or fail='< failed' $(1 < x) or fail='< failed (reversed)' ) snowball-3.1.0/tests/runtime/intlimits.sbl000066400000000000000000000013331520373054300207010ustar00rootroot00000000000000// Test minint and maxint work as expected. 
externals (stem) integers (x y) define stem as ( $(minint < 0) or fail='minint should be negative' $(maxint > 0) or fail='maxint should be positive' $(minint <= -32767) or fail='minint should be <= -32767' $(maxint >= 32767) or fail='maxint should be >= 32767' $(minint + maxint == 0) or $(minint + maxint == -1) or fail='minint+maxint should be 0 or -1' $x = minint $(x == minint) or fail='minint cannot be stored in a variable' $y = x + 1 $(y - x == 1) or fail='Adding one to minint does not work' $x = maxint $(x == maxint) or fail='maxint cannot be stored in a variable' $y = x - 1 $(x - y == 1) or fail='Subtracting one from maxint does not work' ) snowball-3.1.0/tests/runtime/naming.sbl000066400000000000000000000010141520373054300201320ustar00rootroot00000000000000// Test target language naming problem cases. // Test handling of names which only differ by case (for e.g. Pascal). strings (notable NoTable NOTABLE) // Test handling of name ending `_`. // (Regression test for bug in Ada generator fixed in 3.1.0.) integers (x_ x_e x_ee) externals (stem) define stem as ( try ( [next] -> notable next] -> NoTable next] -> NOTABLE $x_ = lenof notable + lenof NoTable + lenof NOTABLE $x_ee = 2 $x_e = len $(x_ + x_ee < x_e) delete ) [tolimit] <- 'ok' ) snowball-3.1.0/tests/runtime/not.sbl000066400000000000000000000012101520373054300174570ustar00rootroot00000000000000// Test `not`. 
externals (stem) define stem as ( ='stopknot' test ( $(cursor == 0) or fail='cursor not 0 before test' next or fail='next failed' do ( not ([next] delete 'o') ) $(cursor == 1) or fail='do failed to restore cursor' ) backwards ( $(cursor == 7) or fail='cursor not 7 before test' next or fail='next failed' do ( not ([next] delete 'n') ) $(cursor == 5) or fail='backwards do failed to restore cursor' ) not (next fail='this should never be seen') $(cursor == 0) or fail='`not` failed to restore cursor' ='ok' not true and fail='`not true` raised signal t' ) snowball-3.1.0/tests/runtime/or.sbl000066400000000000000000000003611520373054300173050ustar00rootroot00000000000000// Test `or`. externals (stem) routines (a) integers (x) define a as try ( next $(x < 2) ) define stem as ( $x = limit a or fail='a did not signal t' $(cursor == 0) or fail='a did not restore cursor' ) snowball-3.1.0/tests/runtime/repeat.sbl000066400000000000000000000006721520373054300201520ustar00rootroot00000000000000// Test `repeat`. stringescapes { } externals ( stem ) define stem as ( ='ok' $(limit==size) or fail='limit not size by default' // Regression tests for Ada `repeat` codegen bug. // https://github.com/snowballstem/snowball/issues/275 setlimit tomark cursor+1 for repeat 'Q' $(limit==size) or fail='limit not restored after repeat' setlimit next for repeat 'Q' $(limit==size) or fail='limit not restored after repeat (2)' ) snowball-3.1.0/tests/runtime/setlimit.sbl000066400000000000000000000014371520373054300205240ustar00rootroot00000000000000// Test setlimit externals (stem) routines (setlimit_tomark) define stem as ( setlimit_tomark ='ok' ) define setlimit_tomark as ( // Regression test for bug fixed in 3.1.0. test ( ='abcd' next // Forwards `setlimit tomark` restored limit incorrectly. 
setlimit tomark 3 for $(limit == 3) or fail='setlimit tomark set wrong limit' $(cursor == 1) or fail='cursor changed after setlimit tomark' $(limit == 4) or fail='limit not restored after setlimit tomark' // Make sure restoring of limit copes with setting the limit to itself! setlimit tomark limit for $(limit == 4) or fail='setlimit tomark limit set wrong limit' $(cursor == 1) or fail='cursor changed after setlimit tomark' $(limit == 4) or fail='limit not restored after setlimit tomark' ) ) snowball-3.1.0/tests/runtime/sizelen.sbl000066400000000000000000000030431520373054300203360ustar00rootroot00000000000000// Test len/lenof/size/sizeof. externals (stem) integers (expected_size) strings (s) stringescapes {} define stem as ( = '' $(len == 0) or fail='len empty incorrect' $(size == 0) or fail='size empty incorrect' = 'test' $(len == 4) or fail='len incorrect' $(size == 4) or fail='size incorrect' $(lenof '' == 0) or fail='lenof empty literal incorrect' $(sizeof '' == 0) or fail='sizeof empty literal incorrect' $(lenof 'test' == 4) or fail='lenof literal incorrect' $(sizeof 'test' == 4) or fail='sizeof literal incorrect' $s = '' $(lenof s == 0) or fail='lenof empty string variable incorrect' $(sizeof s == 0) or fail='sizeof empty string variable incorrect' $s = 'test' $(lenof s == 4) or fail='lenof string variable incorrect' $(sizeof s == 4) or fail='sizeof string variable incorrect' // Probe if character representation is UTF-8. ( // UTF-8. 
$(sizeof '{U+00A0}' == 2) $expected_size = 8 ) or ( $(sizeof '{U+00A0}' == 1) $expected_size = 7 ) or fail='sizeof U+00A0 unexpected' = '{U+00E9}preuve' $(len == 7) or fail='Unicode len incorrect' $(lenof '{U+00E9}preuve' == 7) or fail='Unicode lenof literal incorrect' $s = '{U+00E9}preuve' $(lenof s == 7) or fail='Unicode lenof string variable incorrect' = '{U+00E9}preuve' $(size == expected_size) or fail='Unicode size incorrect' $(sizeof '{U+00E9}preuve' == expected_size) or fail='Unicode sizeof literal incorrect' $s = '{U+00E9}preuve' $(sizeof s == expected_size) or fail='Unicode sizeof string variable incorrect' ='ok' ) snowball-3.1.0/tests/runtime/slice.sbl000066400000000000000000000022561520373054300177710ustar00rootroot00000000000000// Test slice-related operations. externals (stem) strings (s) stringescapes {} define stem as ( test ([ tolimit ] <-'abcd') $(size == 4) or fail='<- replacement did not work as expected' test ([ tolimit ] <-'ok') $(size == 2) or fail='<- replacement did not work as expected (2)' test ([ tolimit ] <-'abcd' $(cursor == 4)) or fail='<- replacement did not adjust cursor as expected' test ([ tolimit ] <-'ok' $(cursor == 2)) or fail='<- replacement did not adjust cursor as expected (2)' test ([ tolimit ]) <-'abcd' $(cursor == 0) or fail='<- replacement did not adjust cursor as expected (3)' test ([ tolimit ]) <-'ok' $(cursor == 0) or fail='<- replacement did not adjust cursor as expected (4)' test ([ next ] delete <-'boo') test ('book' atlimit) or fail='delete did not adjust slice as expected' test ([ next ] <-'ch' <-'outl') test ('outlook' atlimit) or fail='<- did not adjust slice as expected' test (='abc' [ next ] ->s delete <-s) test ('abc' atlimit) or fail='stored slice did not survive later mutation' // Regression test for https://github.com/snowballstem/snowball/issues/242 [] ->s $(sizeof s == 0) or fail='-> on empty slice was not empty' ='ok' ) 
snowball-3.1.0/tests/runtime/stringdollar.sbl000066400000000000000000000035411520373054300213740ustar00rootroot00000000000000// Test string-$. externals (stem) strings (s s2) stringescapes {} define stem as ( $s='' $(lenof s == 0) or fail='setting s empty failed' $s='xyz' $(lenof s == 3) or fail='setting s failed' test ('ok' atlimit) or fail='setting s changed current string' test (next $s hop 2 $(cursor == 1)) or fail='cursor not restored after string-$' test (next $s hop 2 $(limit == 2)) or fail='limit not restored after string-$' test (next $s $(cursor == 0)) or fail='cursor not zeroed inside string-$' test (next $s $(limit == 3)) or fail='limit not reset inside string-$' // Test things are reset on failure (regression test for Dart, Go, Rust). (not $s fail ='news' test ('ok' atlimit)) or fail='current string not restored after failing string-$' test (next not $s fail (='news' next) $(cursor == 1)) or fail='cursor not restored after failing string-$' (not $s fail (='news' next) $(limit == 2)) or fail='limit not restored after failing string-$' // Test things are reset on failure with nested use (regression test for // everything, though for C the only problem was -Wshadow warnings). (not $s2 $s fail ='news' test ('ok' atlimit)) or fail='current string not restored after failing string-$' test (next not $s2 $s fail (='news' next) $(cursor == 1)) or fail='cursor not restored after failing string-$' (not $s2 $s fail (='news' next) $(limit == 2)) or fail='limit not restored after failing string-$' (not $s2 $s (='news' $(cursor == 99)) test ('ok' atlimit)) or fail='current string not restored after failing string-$' test (next not $s2 $s (='news' next $(cursor == 99)) $(cursor == 1)) or fail='cursor not restored after failing string-$' (not $s2 $s (='news' next $(cursor == 99)) $(limit == 2)) or fail='limit not restored after failing string-$' // Test successful nested use. 
$s2 (='x' $s='y') or fail='nested string-$ failed' ) snowball-3.1.0/tests/runtime/strings.sbl000066400000000000000000000050011520373054300203520ustar00rootroot00000000000000// Test strings externals (stem) strings (s) define stem as ( '' or fail='failed to match empty string' test 'o' or fail='failed to match 1-char string' test ('o' $(cursor == 1)) or fail='failed to update cursor after 1-char string match' test 'ok' or fail='failed to match string' test ('ok' $(cursor == 2)) or fail='failed to update cursor after string match' test ('ok' $(cursor == 2) atlimit) or fail='atlimit failed after string match' not 'x' or fail='incorrectly matched single character string' not 'ko' or fail='incorrectly matched string' $(cursor == 0) or fail='cursor not restored after string literal tests' $s='' s or fail='failed to match empty string var' $s='o' test s or fail='failed to match 1-char string var' test (s $(cursor == 1)) or fail='failed to update cursor after 1-char string var match' $s='ok' test s or fail='failed to match string var' test (s $(cursor == 2)) or fail='failed to update cursor after string var match' test (s $(cursor == 2) atlimit) or fail='atlimit failed after string var match' $s='x' not s or fail='incorrectly matched single character string var' $s='ko' not s or fail='incorrectly matched string var' do ( ='test' ) $(cursor == 0) or fail='cursor not restored by do' backwards ( do ( ='test' ) $(cursor == size) or fail='cursor not restored by do' ) ='abcde' test ( $(len == 5) $(cursor == 0) 'abcde' atlimit ) or fail='string assignment wrong' setlimit hop 4 for test (next ='nkl') test ( $(len == 5) $(cursor == 0) 'ankle' atlimit ) or fail='string assignment does not replace c:l (same len)' setlimit hop 4 for test (next ='x') test ( $(len == 3) $(cursor == 0) 'axe' atlimit ) or fail='string assignment does not replace c:l (shorter)' setlimit hop 2 for test (next ='ustralopithecin') test ( $(len == 17) $(cursor == 0) 'australopithecine' atlimit ) or fail='string 
assignment does not replace c:l (longer)' // Test string literals are properly escaped for various target languages. $s='$xxx \x42 \u0042 \u{0042} \u000a {$xxx}' $(lenof s == 39) or fail='bad target language string literal escaping' // Test ASCII control characters in string literals stringescapes {} $s='{U+A}{U+1F}{U+7F}{U+0}x' $(lenof s == 5) or fail='ASCII control characters not handled' $s='a{U+80}{U+A0}b{U+00FF}' $(lenof s == 5) or fail='Unicode characters not handled' // Test that using a C close-comment sequence doesn't trip anything up. not '*/' or fail='C close comment sequence mishandled' ='ok' ) snowball-3.1.0/tests/runtime/test.sbl000066400000000000000000000005471520373054300176520ustar00rootroot00000000000000// Test `test`. externals (stem) strings (s) define stem as ( backwards ( $(cursor == 2) or fail='backwards failed to swap cursor and limit' test ( ='juggernaut' $(cursor == 10) or fail='string= failed to update cursor' next or fail='next failed' ) $(cursor == 10) or fail='test failed to restore cursor' ) ='ok' ) snowball-3.1.0/tests/stemtest.c000066400000000000000000000113171520373054300165170ustar00rootroot00000000000000/* Unit tests for handling of cases the vocabularies don't cover. * * In general we test the stemming algorithms via vocabulary lists * and corresponding lists of expected stems, which are in the snowball-data * repo. These are used to verify that each stemmer produces the expected * output for all the different programming languages Snowball can generate * code for (whereas testcases here only exercise the C stemmers). * * Test coverage for a change to a stemmer's behaviour should generally be * implemented by adding new words its the vocabulary (if words already there * don't adequately cover the change). * * Only cases which can't be exercised by real words in the language being * stemmed should go here. 
Each vocabulary lists also serves as a sample * vocabulary for evalutating its corresponding stemmer, so we don't want to * add entries which would be junk for such use (such as emoji, numbers, hex * codes, etc). * * Testcases here can run for a specified stemmer or for all stemmers. This * provides an easy way to test behaviours which any good stemmer should * have (such as not mangling numbers). */ #include #include #include /* for strlen, memcmp */ #include "libstemmer.h" #define U_0622 "\xd8\xa2" #define U_0627 "\xd8\xa7" #define U_062B "\xd8\xab" #define U_0631 "\xd8\xb1" #define EMOJI_FACE_THROWING_A_KISS "\xf0\x9f\x98\x98" #define U_40079 "\xf1\x80\x81\xb9" static const struct testcase { /* Stemmer to use, or 0 to test with all stemmers */ const char * language; /* Character encoding (can be 0 for UTF-8) */ const char * charenc; /* Input string (0 marks end of list) */ const char * input; /* Expected output string (0 means same as input) */ const char * expect; } testcases[] = { // Regression tests for C support code bug decoding 4 byte UTF-8 sequences. // https://github.com/snowballstem/snowball/issues/138 { "en", 0, "a" EMOJI_FACE_THROWING_A_KISS "ing", "a" EMOJI_FACE_THROWING_A_KISS "e" }, { "en", 0, U_40079 "wing", 0 }, "a" EMOJI_FACE_THROWING_A_KISS "e" }, { "en", 0, U_40079 "wing", 0 }, // The Persian stemmer removes ASCII space inside a word. It shouldn't // appear there if our tokenisation recommendations are followed, and // it seems more appropriate to test here rather than adding instances // to persian/voc.txt. { "fa", 0, U_0622 U_062B " " U_0627 U_0631, U_0622 U_062B U_0627 U_0631, 0 }, // The Finnish stemmer used to damage numbers ending with two or more of // the same digit. Regression test, applied to all stemmers. 
// https://github.com/snowballstem/snowball/issues/66 { 0, 0, "00", 0 }, { 0, 0, "555", 0 }, { 0, 0, "999", 0 }, { 0, 0, "1899", 0 }, { 0, 0, "2000", 0 }, { 0, 0, "9999", 0 }, { 0, 0, "1000000000", 0 }, // The Danish stemmer used to damage a number at the end of a word. // Regression test, applied to all stemmers. // https://github.com/snowballstem/snowball/issues/81 { 0, 0, "space1999", 0 }, { 0, 0, "hal9000", 0 }, { 0, 0, "0x0e00", 0 }, { 0, 0, 0, 0 } }; static void run_testcase(const char * language, const struct testcase *test) { const char * charenc = test->charenc; const char * input = test->input; const char * expect = test->expect; struct sb_stemmer * stemmer = sb_stemmer_new(language, charenc); const sb_symbol * stemmed; int len; if (expect == NULL) expect = input; if (stemmer == 0) { if (charenc == NULL) { fprintf(stderr, "language `%s' not available for stemming\n", language); exit(1); } else { fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); exit(1); } } stemmed = sb_stemmer_stem(stemmer, (const unsigned char*)input, strlen(input)); if (stemmed == NULL) { fprintf(stderr, "Out of memory"); exit(1); } len = sb_stemmer_length(stemmer); if (len != (int)strlen(expect) || memcmp(stemmed, expect, len) != 0) { fprintf(stderr, "%s stemmer output for %s was %.*s not %s\n", language, input, len, stemmed, expect); exit(1); } sb_stemmer_delete(stemmer); } int main(int argc, char * argv[]) { const char ** all_languages = sb_stemmer_list(); const struct testcase * p; (void)argc; (void)argv; for (p = testcases; p->input; ++p) { const char * language = p->language; if (language) { run_testcase(language, p); } else { const char ** l; for (l = all_languages; *l; ++l) { run_testcase(*l, p); } } } return 0; } 
snowball-3.1.0/tests/syntax/000077500000000000000000000000001520373054300160265ustar00rootroot00000000000000snowball-3.1.0/tests/syntax/canon.sbl000066400000000000000000000005271520373054300176320ustar00rootroot00000000000000// Test canonicalisation of commands. externals (stem) integers (x y) define stem as ( $x = cursor setmark y try 'a' atmark x $(cursor == y) atlimit $(cursor >= limit) // `<-''` is the same as `delete`. // `hop 1` is the same as `next` [substring] among ( 'a' (delete next) 'b' (<-'' hop 1) ) $(x == y) ) snowball-3.1.0/tests/syntax/canon.stderr000066400000000000000000000000001520373054300203370ustar00rootroot00000000000000snowball-3.1.0/tests/syntax/canon.syntax000066400000000000000000000006301520373054300203730ustar00rootroot00000000000000define stem ( = x # cursor = y # cursor try literal 'a' == # name x cursor == # name y cursor >= # limit cursor >= # limit cursor [ substring ] among literal 'a' ( delete next literal 'b' ( delete next == # name y name x Function end snowball-3.1.0/tests/syntax/emptyprogram.sbl000066400000000000000000000001351520373054300212550ustar00rootroot00000000000000// This segfaulted in print_node_() with 3.0.1 because the program was NULL. stringescapes{} snowball-3.1.0/tests/syntax/emptyprogram.syntax000066400000000000000000000000001520373054300220120ustar00rootroot00000000000000snowball-3.1.0/tests/syntax/groupings.sbl000066400000000000000000000002371520373054300205470ustar00rootroot00000000000000// Test groupings. externals (stem) groupings (g) stringescapes {} define g 'r{U+EB}p{U+EB}{U+E5}tss' define stem as ( g non-g gopast g gopast non-g ) snowball-3.1.0/tests/syntax/legacy.sbl000066400000000000000000000003671520373054300200020ustar00rootroot00000000000000// Test legacy commands. 
externals (stem) strings (ch) stringescapes {} stringdef o" decimal '246' stringdef o^ hex 'f4' define stem as ( among ( (next) 'a' 'e' 'i' 'o' 'u' ( [next] => ch delete <+ ch ) ) ) snowball-3.1.0/tests/syntax/legacy.stderr000066400000000000000000000010111520373054300205100ustar00rootroot00000000000000syntax/legacy.sbl:5: warning: `decimal` is a legacy feature - use {U+1234} notation instead syntax/legacy.sbl:6: warning: `hex` is a legacy feature - use {U+1234} notation instead syntax/legacy.sbl:12: warning: Use of `=>` is not recommended, see https://snowballstem.org/compiler/snowman.html section 13.3 for details syntax/legacy.sbl:14: warning: `<+` is a legacy feature - use `insert` instead syntax/legacy.sbl:9: warning: among starter is a legacy feature - put starter code between `substring` and `among` instead snowball-3.1.0/tests/syntax/legacy.syntax000066400000000000000000000003741520373054300205460ustar00rootroot00000000000000define stem ( substring ( next among literal 'a' literal 'e' literal 'i' literal 'o' literal 'u' ( [ next ] => ch delete insert ch Function end snowball-3.1.0/tests/syntax/loops.sbl000066400000000000000000000003521520373054300176640ustar00rootroot00000000000000// Test loop commands externals (stem) define stem as ( atleast 0 'x' atleast -1 'y' loop 1 'i' repeat 'y' try atleast len () try atleast len true try atleast len false try repeat () try repeat true repeat false ) snowball-3.1.0/tests/syntax/loops.stderr000066400000000000000000000011561520373054300204120ustar00rootroot00000000000000syntax/loops.sbl:4: warning: atleast 0 C is just repeat C syntax/loops.sbl:5: warning: atleast -1 C is just repeat C syntax/loops.sbl:6: warning: loop 1 C is just C syntax/loops.sbl:8: warning: infinite loop: body of 'atleast' always signals 't' syntax/loops.sbl:9: warning: infinite loop: body of 'atleast' always signals 't' syntax/loops.sbl:10: warning: body of 'atleast' always signals 'f' syntax/loops.sbl:11: warning: infinite loop: body of 'repeat' 
always signals 't' syntax/loops.sbl:12: warning: infinite loop: body of 'repeat' always signals 't' syntax/loops.sbl:13: warning: body of 'repeat' always signals 'f' snowball-3.1.0/tests/syntax/loops.syntax000066400000000000000000000005251520373054300204340ustar00rootroot00000000000000define stem ( repeat literal 'x' repeat literal 'y' literal 'i' repeat literal 'y' try atleast # len ( try atleast # len true try ( false try repeat ( try repeat true do false Function end snowball-3.1.0/tests/syntax/noops.sbl000066400000000000000000000030751520373054300176730ustar00rootroot00000000000000// Test no-op commands which should get canonicalised. externals (stem) integers (x y) booleans (b) strings (s) define stem as ( $x = 0 [substring] among ( // All actions equivalent to `true`. '0' () 'a' (()) 'b' (true) 'c' ('') 'd' (loop 0 next) 'e' (loop -1 next) 'f' (hop 0) 'g' ($x += 0) 'h' ($x -= 0) 'i' ($x *= 1) 'j' ($x /= 1) 'k' ($x = x) 'l' (atmark cursor) 'm' (tomark cursor) 'n' (among ('')) 'o' (substring among ('')) 'p' (among ('' (atmark cursor))) 'q' (substring atmark cursor among ('')) 'r' (substring atmark cursor among ('' (atmark cursor))) 's' (insert '') 't' (attach '') // Values set are unused. 'aa' ($y = 1) 'ab' ($y += 1) 'ac' ($y -= 1) 'ad' ($y *= 2) 'ae' ($y /= 2) 'af' (setmark y) 'ag' (set b) 'ah' (unset b) 'ai' (-> s) // Integer tests. 'A' ($(1 == 1)) 'B' ($(2 > 1)) 'C' ($(-1 < 0)) 'D' ($(-1 <= -1)) 'E' ($(0 >= 0)) 'F' ($(1 != 0)) ) // `setlimit tomark limit for X` is equivalent to just `X`. setlimit tomark limit for 'X' [substring] among ( // All actions equivalent to `false`. '0' (false) 'a' (fail ()) 'b' (fail true) 'c' (fail '') 'd' (fail loop 0 next) 'e' (fail loop -1 next) 'f' (fail hop 0) 'g' (fail $x += 0) 'h' (fail $x -= 0) 'i' (fail $x *= 1) 'j' (fail $x /= 1) // Integer tests which are always false. 
'A' ($(1 != 1)) 'B' ($(1 > 2)) 'C' ($(-1 > 0)) 'D' ($(0 <= -1)) 'E' ($(0 >= 1)) 'F' ($(1 == 0)) ) ) snowball-3.1.0/tests/syntax/noops.stderr000066400000000000000000000031561520373054300204160ustar00rootroot00000000000000syntax/noops.sbl:12: warning: empty literal string is a no-op syntax/noops.sbl:13: warning: `loop 0 C` is a no-op syntax/noops.sbl:14: warning: `loop -1 C` is a no-op syntax/noops.sbl:15: warning: `hop 0` is a no-op syntax/noops.sbl:20: warning: `$x = x` is a no-op syntax/noops.sbl:21: warning: `atmark cursor` is always true syntax/noops.sbl:22: warning: `tomark cursor` is a no-op syntax/noops.sbl:23: warning: `among` with only empty string always matches syntax/noops.sbl:24: warning: `among` with only empty string always matches syntax/noops.sbl:25: warning: `atmark cursor` is always true syntax/noops.sbl:25: warning: `among` with only empty string always matches syntax/noops.sbl:26: warning: `atmark cursor` is always true syntax/noops.sbl:26: warning: `among` with only empty string always matches syntax/noops.sbl:27: warning: `atmark cursor` is always true syntax/noops.sbl:27: warning: `atmark cursor` is always true syntax/noops.sbl:27: warning: `among` with only empty string always matches syntax/noops.sbl:28: warning: `insert ''` is a no-op syntax/noops.sbl:29: warning: `attach ''` is a no-op syntax/noops.sbl:52: warning: `setlimit tomark limit` is a no-op syntax/noops.sbl:58: warning: empty literal string is a no-op syntax/noops.sbl:59: warning: `loop 0 C` is a no-op syntax/noops.sbl:60: warning: `loop -1 C` is a no-op syntax/noops.sbl:61: warning: `hop 0` is a no-op syntax/noops.sbl:5: warning: string 's' is set but never used syntax/noops.sbl:4: warning: boolean 'b' is set but never used syntax/noops.sbl:3: warning: integer 'y' is set but never used syntax/noops.sbl:3: warning: integer 'x' is set but never used snowball-3.1.0/tests/syntax/noops.syntax000066400000000000000000000033121520373054300204330ustar00rootroot00000000000000define stem ( 
true [ substring ] among literal '0' ( literal 'a' ( literal 'b' ( literal 'c' ( literal 'd' ( literal 'e' ( literal 'f' ( literal 'g' ( literal 'h' ( literal 'i' ( literal 'j' ( literal 'k' ( literal 'l' ( literal 'm' ( literal 'n' ( literal 'o' ( literal 'p' ( literal 'q' ( literal 'r' ( literal 's' ( literal 't' ( literal 'aa' ( literal 'ab' ( literal 'ac' ( literal 'ad' ( literal 'ae' ( literal 'af' ( literal 'ag' ( literal 'ah' ( literal 'ai' ( literal 'A' ( literal 'B' ( literal 'C' ( literal 'D' ( literal 'E' ( literal 'F' ( literal 'X' [ substring ] among literal '0' ( false literal 'a' ( false literal 'b' ( false literal 'c' ( false literal 'd' ( false literal 'e' ( false literal 'f' ( false literal 'g' ( false literal 'h' ( false literal 'i' ( false literal 'j' ( false literal 'A' ( false literal 'B' ( false literal 'C' ( false literal 'D' ( false literal 'E' ( false literal 'F' ( false snowball-3.1.0/tests/syntax/simplifyae.sbl000066400000000000000000000004401520373054300206700ustar00rootroot00000000000000// Test normalisation of arithmetic expressions (AEs). integers(x y) externals(stem) define stem as ( $y = size * 0 $x = size $x *= 0 $x *= -1 $y += x $y /= -1 $x = size * -1 $y += x $x = -1 * size $y += x $x = size / -1 $y += x $y -= -1 $y += -1 $y == 1 ) snowball-3.1.0/tests/syntax/simplifyae.stderr000066400000000000000000000000001520373054300214030ustar00rootroot00000000000000snowball-3.1.0/tests/syntax/simplifyae.syntax000066400000000000000000000006421520373054300214420ustar00rootroot00000000000000define stem ( = y # number 0 = x # size = x # number 0 = x # neg name x += y # name x = y # neg name y = x # neg size += y # name x = x # neg size += y # name x = x # neg size += y # name x += y # number 1 -= y # number 1 == # number 1 name y Function end snowball-3.1.0/tests/syntax/unused.sbl000066400000000000000000000004741520373054300200400ustar00rootroot00000000000000// Test unused variables, routines and groupings. 
externals (stem) integers (x unused_int) booleans (b unused_bool) strings (unused string) routines (r undef_routine) groupings (g undef_grouping) define g 'aeiou' define r as ( unset b try (next set b) $x = size b $(x > 2) g ) define stem as true snowball-3.1.0/tests/syntax/unused.stderr000066400000000000000000000007721520373054300205640ustar00rootroot00000000000000syntax/unused.sbl:7: warning: grouping 'undef_grouping' declared but not defined syntax/unused.sbl:6: warning: routine 'undef_routine' declared but not defined syntax/unused.sbl:9: warning: routine 'r' defined but not used syntax/unused.sbl:5: warning: string 'string' declared but not used syntax/unused.sbl:5: warning: string 'unused' declared but not used syntax/unused.sbl:4: warning: boolean 'unused_bool' declared but not used syntax/unused.sbl:3: warning: integer 'unused_int' declared but not used snowball-3.1.0/tests/syntax/unused.syntax000066400000000000000000000000421520373054300205750ustar00rootroot00000000000000define stem true Function end snowball-3.1.0/zig/000077500000000000000000000000001520373054300141275ustar00rootroot00000000000000snowball-3.1.0/zig/env.zig000066400000000000000000000453521520373054300154430ustar00rootroot00000000000000const std = @import("std"); pub const MaxInt = std.math.maxInt(i32); pub const MinInt = std.math.minInt(i32); pub const Among = struct { s: []const u8, substring_i: i32, result: i32, method: ?*const fn (*Env, *anyopaque) bool, }; pub const String = struct { value: []u8 = &.{}, pub fn deinit(self: *String, allocator: std.mem.Allocator) void { if (self.value.len > 0) { allocator.free(self.value); } self.value = &.{}; } pub fn assign(self: *String, allocator: std.mem.Allocator, s: []const u8) !void { const new_value = try dupeOrEmpty(allocator, s); self.deinit(allocator); self.value = new_value; } pub fn slice(self: *const String) []const u8 { return self.value; } }; pub const Env = struct { current: []u8, cursor: usize, limit: usize, limit_backward: usize, bra: 
usize, ket: usize, allocator: std.mem.Allocator, pub fn init(allocator: std.mem.Allocator) Env { return .{ .current = &.{}, .cursor = 0, .limit = 0, .limit_backward = 0, .bra = 0, .ket = 0, .allocator = allocator, }; } pub fn deinit(self: *Env) void { if (self.current.len > 0) { self.allocator.free(self.current); } } pub fn setCurrent(self: *Env, s: []const u8) !void { const new_current = try dupeOrEmpty(self.allocator, s); if (self.current.len > 0) { self.allocator.free(self.current); } self.current = new_current; self.cursor = 0; self.limit = s.len; self.limit_backward = 0; self.bra = 0; self.ket = s.len; } pub fn getCurrent(self: *const Env) []const u8 { return self.current[0..self.limit]; } pub fn replaceS(self: *Env, bra_arg: usize, ket_arg: usize, s: []const u8) !i32 { const adjustment: i32 = @as(i32, @intCast(s.len)) - (@as(i32, @intCast(ket_arg)) - @as(i32, @intCast(bra_arg))); // Always build a new buffer: current[0..bra] + s + current[rsplit..] // Must include content beyond limit (setlimit may be active). 
const rsplit = if (ket_arg < bra_arg) bra_arg else ket_arg; const tail_len = self.current.len - rsplit; const new_len = bra_arg + s.len + tail_len; const new_buf = try allocOrEmpty(self.allocator, new_len); std.mem.copyForwards(u8, new_buf[0..bra_arg], self.current[0..bra_arg]); std.mem.copyForwards(u8, new_buf[bra_arg..][0..s.len], s); std.mem.copyForwards(u8, new_buf[bra_arg + s.len ..][0..tail_len], self.current[rsplit..][0..tail_len]); if (self.current.len > 0) { self.allocator.free(self.current); } self.current = new_buf; self.limit = adjustIndex(self.limit, adjustment); if (self.cursor >= ket_arg) { self.cursor = adjustIndex(self.cursor, adjustment); } else if (self.cursor > bra_arg) { self.cursor = bra_arg; } return adjustment; } pub fn eqS(self: *Env, s: []const u8) bool { if (self.cursor >= self.limit) return false; if (self.cursor + s.len > self.limit) return false; if (!std.mem.eql(u8, self.current[self.cursor..][0..s.len], s)) return false; self.cursor += s.len; return true; } pub fn eqSB(self: *Env, s: []const u8) bool { if (@as(i32, @intCast(self.cursor)) - @as(i32, @intCast(self.limit_backward)) < @as(i32, @intCast(s.len))) return false; if (!std.mem.eql(u8, self.current[self.cursor - s.len ..][0..s.len], s)) return false; self.cursor -= s.len; return true; } pub fn sliceFrom(self: *Env, s: []const u8) !void { const bra_val = self.bra; _ = try self.replaceS(bra_val, self.ket, s); self.ket = bra_val + s.len; } pub fn sliceDel(self: *Env) !void { try self.sliceFrom(""); } pub fn insert(self: *Env, bra_arg: usize, ket_arg: usize, s: []const u8) !void { const adjustment = try self.replaceS(bra_arg, ket_arg, s); if (bra_arg <= self.bra) { self.bra = adjustIndex(self.bra, adjustment); } if (bra_arg <= self.ket) { self.ket = adjustIndex(self.ket, adjustment); } } pub fn sliceTo(self: *const Env) []const u8 { return self.current[self.bra..self.ket]; } pub fn assignTo(self: *const Env) []const u8 { return self.getCurrent(); } pub fn nextChar(self: *Env) void 
{ self.cursor += 1; while (self.cursor < self.current.len and !onCharBoundary(self.current, self.cursor)) { self.cursor += 1; } } pub fn prevChar(self: *Env) void { self.cursor -= 1; while (self.cursor > 0 and !onCharBoundary(self.current, self.cursor)) { self.cursor -= 1; } } pub fn hop(self: *Env, delta: i32) bool { var d = delta; var res = self.cursor; while (d > 0) { d -= 1; if (res >= self.limit) return false; res += 1; while (res < self.limit and !onCharBoundary(self.current, res)) { res += 1; } } self.cursor = res; return true; } pub fn hopChecked(self: *Env, delta: i32) bool { return delta >= 0 and self.hop(delta); } pub fn hopBack(self: *Env, delta: i32) bool { var d = delta; var res = self.cursor; while (d > 0) { d -= 1; if (res <= self.limit_backward) return false; res -= 1; while (res > self.limit_backward and !onCharBoundary(self.current, res)) { res -= 1; } } self.cursor = res; return true; } pub fn hopBackChecked(self: *Env, delta: i32) bool { return delta >= 0 and self.hopBack(delta); } pub fn inGrouping(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { if (self.cursor >= self.limit) return false; const r = decodeRune(self.current[self.cursor..]) orelse return false; if (r > max_ch or r < min_ch) return false; const idx = r - min_ch; if (!inBitmap(chars, idx)) return false; self.nextChar(); return true; } pub fn inGroupingB(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { if (self.cursor <= self.limit_backward) return false; const c = self.cursor; self.prevChar(); const r = decodeRune(self.current[self.cursor..]) orelse { self.cursor = c; return false; }; if (r > max_ch or r < min_ch) { self.cursor = c; return false; } const idx = r - min_ch; if (!inBitmap(chars, idx)) { self.cursor = c; return false; } return true; } pub fn outGrouping(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { if (self.cursor >= self.limit) return false; const r = decodeRune(self.current[self.cursor..]) orelse return false; if (r > 
max_ch or r < min_ch) { self.nextChar(); return true; } const idx = r - min_ch; if (!inBitmap(chars, idx)) { self.nextChar(); return true; } return false; } pub fn outGroupingB(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { if (self.cursor <= self.limit_backward) return false; const c = self.cursor; self.prevChar(); const r = decodeRune(self.current[self.cursor..]) orelse { self.cursor = c; return false; }; if (r > max_ch or r < min_ch) return true; const idx = r - min_ch; if (!inBitmap(chars, idx)) return true; self.cursor = c; return false; } pub fn goInGrouping(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { while (self.cursor < self.limit) { const r = decodeRune(self.current[self.cursor..]) orelse return false; if (r > max_ch or r < min_ch) return true; const idx = r - min_ch; if (!inBitmap(chars, idx)) return true; self.nextChar(); } return false; } pub fn goInGroupingB(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { while (self.cursor > self.limit_backward) { const c = self.cursor; self.prevChar(); const r = decodeRune(self.current[self.cursor..]) orelse return false; if (r > max_ch or r < min_ch) { self.cursor = c; return true; } const idx = r - min_ch; if (!inBitmap(chars, idx)) { self.cursor = c; return true; } } return false; } pub fn goOutGrouping(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { while (self.cursor < self.limit) { const r = decodeRune(self.current[self.cursor..]) orelse return false; if (r <= max_ch and r >= min_ch) { const idx = r - min_ch; if (inBitmap(chars, idx)) return true; } self.nextChar(); } return false; } pub fn goOutGroupingB(self: *Env, chars: []const u8, min_ch: i32, max_ch: i32) bool { while (self.cursor > self.limit_backward) { const c = self.cursor; self.prevChar(); const r = decodeRune(self.current[self.cursor..]) orelse return false; if (r <= max_ch and r >= min_ch) { const idx = r - min_ch; if (inBitmap(chars, idx)) { self.cursor = c; return true; } } } 
return false; } pub fn findAmong(self: *Env, amongs: []const Among, ctx: *anyopaque) i32 { var i: i32 = 0; var j: i32 = @intCast(amongs.len); const c = self.cursor; const l = self.limit; var common_i: usize = 0; var common_j: usize = 0; var first_key_inspected = false; while (true) { const k: usize = @intCast(i + @divTrunc(j - i, 2)); var diff: i32 = 0; var common = @min(common_i, common_j); const w = amongs[k]; var lvar: usize = common; while (lvar < w.s.len) : (lvar += 1) { if (c + common == l) { diff -= 1; break; } diff = @as(i32, @intCast(self.current[c + common])) - @as(i32, @intCast(w.s[lvar])); if (diff != 0) break; common += 1; } if (diff < 0) { j = @intCast(k); common_j = common; } else { i = @intCast(k); common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } while (true) { const w = amongs[@intCast(i)]; if (common_i >= w.s.len) { self.cursor = c + w.s.len; if (w.method) |method| { if (method(self, ctx)) { self.cursor = c + w.s.len; return w.result; } } else { return w.result; } } i = w.substring_i; if (i < 0) return 0; } } pub fn findAmongB(self: *Env, amongs: []const Among, ctx: *anyopaque) i32 { var i: i32 = 0; var j: i32 = @intCast(amongs.len); const c = self.cursor; const lb = self.limit_backward; var common_i: usize = 0; var common_j: usize = 0; var first_key_inspected = false; while (true) { const k: usize = @intCast(i + @divTrunc(j - i, 2)); var diff: i32 = 0; var common = @min(common_i, common_j); const w = amongs[k]; { var lvar_signed: i32 = @as(i32, @intCast(w.s.len)) - @as(i32, @intCast(common)) - 1; while (lvar_signed >= 0) : (lvar_signed -= 1) { const lvar: usize = @intCast(lvar_signed); if (c - common == lb) { diff -= 1; break; } diff = @as(i32, @intCast(self.current[c - common - 1])) - @as(i32, @intCast(w.s[lvar])); if (diff != 0) break; common += 1; } } if (diff < 0) { j = @intCast(k); common_j = common; } else { i = @intCast(k); common_i = common; } if (j 
- i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } while (true) { const w = amongs[@intCast(i)]; if (common_i >= w.s.len) { self.cursor = c - w.s.len; if (w.method) |method| { if (method(self, ctx)) { self.cursor = c - w.s.len; return w.result; } } else { return w.result; } } i = w.substring_i; if (i < 0) return 0; } } pub fn clone(self: *const Env) !Env { var c = self.*; c.current = try dupeOrEmpty(self.allocator, self.current); return c; } pub fn copyFrom(self: *Env, other: *const Env) !void { if (self == other) return; const new_current = try dupeOrEmpty(self.allocator, other.current); if (self.current.len > 0) { self.allocator.free(self.current); } self.current = new_current; self.cursor = other.cursor; self.limit = other.limit; self.limit_backward = other.limit_backward; self.bra = other.bra; self.ket = other.ket; } pub fn debug(self: *const Env, count: i32, line_number: i32) void { _ = self; std.log.debug("snowball debug, count: {d}, line: {d}", .{ count, line_number }); } }; fn allocOrEmpty(allocator: std.mem.Allocator, len: usize) ![]u8 { if (len == 0) return &.{}; return allocator.alloc(u8, len); } fn dupeOrEmpty(allocator: std.mem.Allocator, s: []const u8) ![]u8 { if (s.len == 0) return &.{}; return allocator.dupe(u8, s); } fn adjustIndex(value: usize, adjustment: i32) usize { if (adjustment >= 0) { return value + @as(usize, @intCast(adjustment)); } return value - @as(usize, @intCast(-adjustment)); } fn inBitmap(chars: []const u8, idx: i32) bool { const u_idx: u32 = @intCast(idx); return (chars[u_idx >> 3] & (@as(u8, 1) << @intCast(@as(u3, @truncate(u_idx))))) != 0; } fn onCharBoundary(s: []const u8, pos: usize) bool { if (pos == 0 or pos >= s.len) return true; // A byte is a UTF-8 start byte if its top two bits are not 10xxxxxx. 
return (s[pos] & 0xC0) != 0x80; } fn decodeRune(s: []const u8) ?i32 { if (s.len == 0) return null; const b0 = s[0]; if (b0 < 0x80) return @intCast(b0); if (b0 < 0xC0) return null; // continuation byte if (b0 < 0xE0) { if (s.len < 2) return null; return (@as(i32, b0 & 0x1F) << 6) | @as(i32, s[1] & 0x3F); } if (b0 < 0xF0) { if (s.len < 3) return null; return (@as(i32, b0 & 0x0F) << 12) | (@as(i32, s[1] & 0x3F) << 6) | @as(i32, s[2] & 0x3F); } if (s.len < 4) return null; return (@as(i32, b0 & 0x07) << 18) | (@as(i32, s[1] & 0x3F) << 12) | (@as(i32, s[2] & 0x3F) << 6) | @as(i32, s[3] & 0x3F); } pub fn runeCountInString(s: []const u8) i32 { var count: i32 = 0; var i: usize = 0; while (i < s.len) { if (s[i] < 0x80) { i += 1; } else if (s[i] < 0xE0) { i += 2; } else if (s[i] < 0xF0) { i += 3; } else { i += 4; } count += 1; } return count; } test "setCurrent preserves existing state on allocation failure" { var failing_allocator = std.testing.FailingAllocator.init(std.testing.allocator, .{ .fail_index = 1, }); var env = Env.init(failing_allocator.allocator()); defer env.deinit(); try env.setCurrent("abc"); env.cursor = 1; env.limit_backward = 1; env.bra = 1; env.ket = 2; try std.testing.expectError(error.OutOfMemory, env.setCurrent("wxyz")); try std.testing.expectEqualStrings("abc", env.current); try std.testing.expectEqual(@as(usize, 1), env.cursor); try std.testing.expectEqual(@as(usize, 3), env.limit); try std.testing.expectEqual(@as(usize, 1), env.limit_backward); try std.testing.expectEqual(@as(usize, 1), env.bra); try std.testing.expectEqual(@as(usize, 2), env.ket); } test "copyFrom preserves existing state on allocation failure" { var failing_allocator = std.testing.FailingAllocator.init(std.testing.allocator, .{ .fail_index = 2, }); const allocator = failing_allocator.allocator(); var env = Env.init(allocator); defer env.deinit(); try env.setCurrent("abc"); env.cursor = 1; env.limit_backward = 1; env.bra = 1; env.ket = 2; var other = Env.init(allocator); defer 
other.deinit(); try other.setCurrent("wxyz"); other.cursor = 3; other.limit_backward = 1; other.bra = 2; other.ket = 4; try std.testing.expectError(error.OutOfMemory, env.copyFrom(&other)); try std.testing.expectEqualStrings("abc", env.current); try std.testing.expectEqual(@as(usize, 1), env.cursor); try std.testing.expectEqual(@as(usize, 3), env.limit); try std.testing.expectEqual(@as(usize, 1), env.limit_backward); try std.testing.expectEqual(@as(usize, 1), env.bra); try std.testing.expectEqual(@as(usize, 2), env.ket); } test "copyFrom supports self copy" { var env = Env.init(std.testing.allocator); defer env.deinit(); try env.setCurrent("abc"); env.cursor = 1; env.limit_backward = 1; env.bra = 1; env.ket = 2; try env.copyFrom(&env); try std.testing.expectEqualStrings("abc", env.current); try std.testing.expectEqual(@as(usize, 1), env.cursor); try std.testing.expectEqual(@as(usize, 3), env.limit); try std.testing.expectEqual(@as(usize, 1), env.limit_backward); try std.testing.expectEqual(@as(usize, 1), env.bra); try std.testing.expectEqual(@as(usize, 2), env.ket); } snowball-3.1.0/zig/generate_algorithms.pl000077500000000000000000000012601520373054300205110ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; my @algorithms = @ARGV; print("// Generated by generate_algorithms.pl - DO NOT EDIT\n\n"); print("const snowball = \@import(\"env.zig\");\n\n"); foreach my $alg (@algorithms) { # Suffix constant with `_stemmer` to avoid generating bad code if # the algorithm name is a zig reserved word. 
print("const ${alg}_stemmer = \@import(\"${alg}_stemmer.zig\");\n"); } print("\npub const StemFn = *const fn (*snowball.Env) bool;\n\n"); print("pub const algorithms = [_]struct { name: []const u8, stem: StemFn }{\n"); foreach my $alg (@algorithms) { print(" .{ .name = \"$alg\", .stem = ${alg}_stemmer.stem },\n"); } print("};\n"); snowball-3.1.0/zig/stemwords.zig000066400000000000000000000103731520373054300166750ustar00rootroot00000000000000const std = @import("std"); const snowball = @import("env.zig"); const alg = @import("algorithms.zig"); fn findStemmer(name: []const u8) ?alg.StemFn { for (alg.algorithms) |a| { if (std.mem.eql(u8, a.name, name)) return a.stem; } return null; } fn readFile(io: std.Io, allocator: std.mem.Allocator, path: []const u8) ![]u8 { const file = try std.Io.Dir.cwd().openFile(io, path, .{}); defer file.close(io); const stat = try file.stat(io); const size: usize = @intCast(stat.size); const buf = try allocator.alloc(u8, size); errdefer allocator.free(buf); const n = try file.readPositionalAll(io, buf, 0); return buf[0..n]; } fn writeFile(io: std.Io, path: []const u8, data: []const u8) !void { const file = try std.Io.Dir.cwd().createFile(io, path, .{}); defer file.close(io); try file.writePositionalAll(io, data, 0); } pub fn main(init: std.process.Init) !void { const io = init.io; const allocator = init.gpa; var args_iter = std.process.Args.Iterator.init(init.minimal.args); _ = args_iter.next(); // skip program name var language: ?[]const u8 = null; var input_path: ?[]const u8 = null; var output_path: ?[]const u8 = null; while (args_iter.next()) |arg| { if (std.mem.eql(u8, arg, "-l")) { language = args_iter.next(); } else if (std.mem.eql(u8, arg, "-i")) { input_path = args_iter.next(); } else if (std.mem.eql(u8, arg, "-o")) { output_path = args_iter.next(); } } const lang = language orelse std.process.fatal("error: -l language required", .{}); const stem_fn = findStemmer(lang) orelse std.process.fatal("error: unknown language '{s}'", 
.{lang}); var env = snowball.Env.init(allocator); defer env.deinit(); if (input_path) |p| { // File input: read all, process, write all const input_data = try readFile(io, allocator, p); defer allocator.free(input_data); var output = std.ArrayList(u8).initCapacity(allocator, input_data.len) catch std.process.fatal("out of memory", .{}); defer output.deinit(allocator); var rest = input_data; while (std.mem.indexOfScalar(u8, rest, '\n')) |nl| { const line = rest[0..nl]; if (line.len > 0) { try env.setCurrent(line); _ = stem_fn(&env); output.appendSlice(allocator, env.getCurrent()) catch std.process.fatal("out of memory", .{}); } output.append(allocator, '\n') catch std.process.fatal("out of memory", .{}); rest = rest[nl + 1 ..]; } if (rest.len > 0) { try env.setCurrent(rest); _ = stem_fn(&env); output.appendSlice(allocator, env.getCurrent()) catch std.process.fatal("out of memory", .{}); output.append(allocator, '\n') catch std.process.fatal("out of memory", .{}); } if (output_path) |out_path| { try writeFile(io, out_path, output.items); } else { var write_buf: [8192]u8 = undefined; var writer = std.Io.File.stdout().writerStreaming(io, &write_buf); try writer.interface.writeAll(output.items); try writer.interface.flush(); } } else { // Stdin: process line by line, stream output var read_buf: [8192]u8 = undefined; var reader = std.Io.File.stdin().readerStreaming(io, &read_buf); const out_file = if (output_path) |out_path| try std.Io.Dir.cwd().createFile(io, out_path, .{}) else null; defer if (out_file) |f| f.close(io); var write_buf: [8192]u8 = undefined; var writer = if (out_file) |f| f.writerStreaming(io, &write_buf) else std.Io.File.stdout().writerStreaming(io, &write_buf); while (try reader.interface.takeDelimiter('\n')) |line| { if (line.len > 0) { try env.setCurrent(line); _ = stem_fn(&env); try writer.interface.writeAll(env.getCurrent()); } try writer.interface.writeByte('\n'); } try writer.interface.flush(); } }