pax_global_header00006660000000000000000000000064151642716640014526gustar00rootroot0000000000000052 comment=ae849b648bb334e9b5a9ce9d2613e9c1e7eaa61c ankane-neighbor-ae849b6/000077500000000000000000000000001516427166400152025ustar00rootroot00000000000000ankane-neighbor-ae849b6/.github/000077500000000000000000000000001516427166400165425ustar00rootroot00000000000000ankane-neighbor-ae849b6/.github/workflows/000077500000000000000000000000001516427166400205775ustar00rootroot00000000000000ankane-neighbor-ae849b6/.github/workflows/build.yml000066400000000000000000000025271516427166400224270ustar00rootroot00000000000000name: build on: [push, pull_request] jobs: build: strategy: fail-fast: false matrix: include: - ruby: "4.0" gemfile: Gemfile - ruby: 3.4 gemfile: gemfiles/activerecord80.gemfile - ruby: 3.3 gemfile: gemfiles/activerecord72.gemfile runs-on: ubuntu-latest env: BUNDLE_GEMFILE: ${{ matrix.gemfile }} services: mariadb: image: mariadb:11.8 env: MARIADB_ALLOW_EMPTY_ROOT_PASSWORD: 1 MARIADB_DATABASE: neighbor_test ports: - 3307:3306 mysql: image: mysql:9 env: MYSQL_ALLOW_EMPTY_PASSWORD: 1 MYSQL_DATABASE: neighbor_test ports: - 3306:3306 steps: - uses: actions/checkout@v5 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby }} bundler-cache: true - uses: ankane/setup-postgres@v1 with: database: neighbor_test dev-files: true - run: | cd /tmp git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install - run: bundle exec rake test - run: TEST_TRILOGY=1 bundle exec rake test:mariadb - run: TEST_TRILOGY=1 bundle exec rake test:mysql ankane-neighbor-ae849b6/.gitignore000066400000000000000000000001201516427166400171630ustar00rootroot00000000000000/.bundle/ /.yardoc /_yardoc/ /coverage/ /doc/ /pkg/ /spec/reports/ /tmp/ *.lock ankane-neighbor-ae849b6/CHANGELOG.md000066400000000000000000000054461516427166400170240ustar00rootroot00000000000000## 1.0.0 (2026-04-04) - Dropped support for Ruby < 3.3 and Active Record < 7.2 ## 0.6.0 (2025-06-12) - Added support for MariaDB 11.8 - Dropped experimental support for MariaDB 11.7 - Dropped support for Ruby < 3.2 and Active Record < 7.1 ## 0.5.2 (2025-01-05) - Improved support for Postgres arrays ## 0.5.1 (2024-12-03) - Added experimental support for MariaDB 11.7 - Dropped experimental support for MariaDB 11.6 Vector ## 0.5.0 (2024-10-07) - Added experimental support for SQLite (sqlite-vec) - Added experimental support for MariaDB 11.6 Vector - Added experimental support for MySQL 9 - Changed `normalize` option to use Active Record normalization - Fixed connection leasing for Active Record 7.2 - Dropped support for Active Record < 7 ## 0.4.3 (2024-09-02) - Added `rrf` method ## 0.4.2 (2024-08-27) - Fixed error with `nil` values ## 0.4.1 (2024-08-26) - Added `precision` option - Added support for `bit` dimensions to model generator - Fixed error with Numo arrays ## 0.4.0 (2024-06-25) - Added support for `halfvec` and `sparsevec` types - Added support for `taxicab`, `hamming`, and `jaccard` distances with `vector` extension - Added deserialization for `cube` and `vector` columns without `has_neighbor` - Added support for composite primary keys - Changed `nearest_neighbors` to replace previous `order` scopes - Changed `normalize` option to use `before_save` callback - Changed dimensions and finite values checks to use Active Record validations - Fixed issue with `nearest_neighbors` scope overriding `select` values - Removed default attribute name - Dropped support for Ruby < 3.1 ## 0.3.2 (2023-12-12) - Added deprecation warning for `has_neighbors` without an attribute name - Added deprecation warning for `nearest_neighbors` without an attribute name ## 0.3.1 (2023-09-25) - Added support for passing multiple attributes to `has_neighbors` - Fixed error with `nearest_neighbors` scope with Ruby 3.2 and Active Record 6.1 ## 0.3.0 (2023-07-24) - Dropped support for Ruby < 3 and Active Record < 6.1 ## 0.2.3 (2023-04-02) - Added support for dimensions to model generator ## 0.2.2 (2022-07-13) - Added support for configurable attribute name - Added support for multiple attributes per model ## 0.2.1 (2021-12-15) - Added support for Active Record 7 ## 0.2.0 (2021-04-21) - Added support for pgvector - Added `normalize` option - Made `dimensions` optional - Raise an error if `nearest_neighbors` already defined - Raise an error for non-finite values - Fixed NaN with zero vectors and cosine distance Breaking changes - The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default ## 0.1.2 (2021-02-21) - Added `nearest_neighbors` scope ## 0.1.1 (2021-02-16) - Fixed `Could not dump table` error ## 0.1.0 (2021-02-15) - First release ankane-neighbor-ae849b6/Gemfile000066400000000000000000000003231516427166400164730ustar00rootroot00000000000000source "https://rubygems.org" gemspec gem "rake" gem "minitest" gem "activerecord", "~> 8.1.0" gem "pg" gem "sqlite3" gem "sqlite-vec", platform: :mri gem "mysql2" gem "trilogy" gem "railties", require: false ankane-neighbor-ae849b6/LICENSE.txt000066400000000000000000000020731516427166400170270ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2021-2026 Andrew Kane Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ankane-neighbor-ae849b6/README.md000066400000000000000000000454431516427166400164730ustar00rootroot00000000000000# Neighbor Nearest neighbor search for Rails Supports: - Postgres (cube and pgvector) - MariaDB 11.8 - MySQL 9 (searching requires HeatWave) - experimental - SQLite (sqlite-vec) - experimental Also available for [Redis](https://github.com/ankane/neighbor-redis) and [S3 Vectors](https://github.com/ankane/neighbor-s3) [![Build Status](https://github.com/ankane/neighbor/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/neighbor/actions) ## Installation Add this line to your application’s Gemfile: ```ruby gem "neighbor" ``` ### For Postgres Neighbor supports two extensions: [cube](https://www.postgresql.org/docs/current/cube.html) and [pgvector](https://github.com/pgvector/pgvector). cube ships with Postgres, while pgvector supports more dimensions and approximate nearest neighbor search. For cube, run: ```sh rails generate neighbor:cube rails db:migrate ``` For pgvector, [install the extension](https://github.com/pgvector/pgvector#installation) and run: ```sh rails generate neighbor:vector rails db:migrate ``` ### For SQLite Add this line to your application’s Gemfile: ```ruby gem "sqlite-vec" ``` And run: ```sh rails generate neighbor:sqlite ``` ## Getting Started Create a migration ```ruby class AddEmbeddingToItems < ActiveRecord::Migration[8.1] def change # cube add_column :items, :embedding, :cube # pgvector, MariaDB, and MySQL add_column :items, :embedding, :vector, limit: 3 # dimensions # sqlite-vec add_column :items, :embedding, :binary end end ``` Add to your model ```ruby class Item < ApplicationRecord has_neighbors :embedding end ``` Update the vectors ```ruby item.update(embedding: [1.0, 1.2, 0.5]) ``` Get the nearest neighbors to a record ```ruby item.nearest_neighbors(:embedding, distance: "euclidean").first(5) ``` Get the nearest neighbors to a vector ```ruby Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean").first(5) ``` Records returned from `nearest_neighbors` will have a `neighbor_distance` attribute ```ruby nearest_item = item.nearest_neighbors(:embedding, distance: "euclidean").first nearest_item.neighbor_distance ``` See the additional docs for: - [cube](#cube) - [pgvector](#pgvector) - [MariaDB](#mariadb) - [MySQL](#mysql) - [sqlite-vec](#sqlite-vec) Or check out some [examples](#examples) ## cube ### Distance Supported values are: - `euclidean` - `cosine` - `taxicab` - `chebyshev` For cosine distance with cube, vectors must be normalized before being stored. ```ruby class Item < ApplicationRecord has_neighbors :embedding, normalize: true end ``` For inner product with cube, see [this example](examples/disco/user_recs_cube.rb). ### Dimensions The `cube` type can have up to 100 dimensions by default. See the [Postgres docs](https://www.postgresql.org/docs/current/cube.html) for how to increase this. For cube, it’s a good idea to specify the number of dimensions to ensure all records have the same number. ```ruby class Item < ApplicationRecord has_neighbors :embedding, dimensions: 3 end ``` ## pgvector ### Distance Supported values are: - `euclidean` - `inner_product` - `cosine` - `taxicab` - `hamming` - `jaccard` ### Dimensions The `vector` type can have up to 16,000 dimensions, and vectors with up to 2,000 dimensions can be indexed. The `halfvec` type can have up to 16,000 dimensions, and half vectors with up to 4,000 dimensions can be indexed. The `bit` type can have up to 83 million dimensions, and bit vectors with up to 64,000 dimensions can be indexed. The `sparsevec` type can have up to 16,000 non-zero elements, and sparse vectors with up to 1,000 non-zero elements can be indexed. ### Indexing Add an approximate index to speed up queries. Create a migration with: ```ruby class AddIndexToItemsEmbedding < ActiveRecord::Migration[8.1] def change add_index :items, :embedding, using: :hnsw, opclass: :vector_l2_ops # or add_index :items, :embedding, using: :ivfflat, opclass: :vector_l2_ops end end ``` Use `:vector_cosine_ops` for cosine distance and `:vector_ip_ops` for inner product. Set the size of the dynamic candidate list with HNSW ```ruby Item.connection.execute("SET hnsw.ef_search = 100") ``` Or the number of probes with IVFFlat ```ruby Item.connection.execute("SET ivfflat.probes = 3") ``` ### Half-Precision Vectors Use the `halfvec` type to store half-precision vectors ```ruby class AddEmbeddingToItems < ActiveRecord::Migration[8.1] def change add_column :items, :embedding, :halfvec, limit: 3 # dimensions end end ``` ### Half-Precision Indexing Index vectors at half precision for smaller indexes ```ruby class AddIndexToItemsEmbedding < ActiveRecord::Migration[8.1] def change add_index :items, "(embedding::halfvec(3)) halfvec_l2_ops", using: :hnsw end end ``` Get the nearest neighbors ```ruby Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean", precision: "half").first(5) ``` ### Binary Vectors Use the `bit` type to store binary vectors ```ruby class AddEmbeddingToItems < ActiveRecord::Migration[8.1] def change add_column :items, :embedding, :bit, limit: 3 # dimensions end end ``` Get the nearest neighbors by Hamming distance ```ruby Item.nearest_neighbors(:embedding, "101", distance: "hamming").first(5) ``` ### Binary Quantization Use expression indexing for binary quantization ```ruby class AddIndexToItemsEmbedding < ActiveRecord::Migration[8.1] def change add_index :items, "(binary_quantize(embedding)::bit(3)) bit_hamming_ops", using: :hnsw end end ``` ### Sparse Vectors Use the `sparsevec` type to store sparse vectors ```ruby class AddEmbeddingToItems < ActiveRecord::Migration[8.1] def change add_column :items, :embedding, :sparsevec, limit: 3 # dimensions end end ``` Get the nearest neighbors ```ruby embedding = Neighbor::SparseVector.new({0 => 0.9, 1 => 1.3, 2 => 1.1}, 3) Item.nearest_neighbors(:embedding, embedding, distance: "euclidean").first(5) ``` ## MariaDB ### Distance Supported values are: - `euclidean` - `cosine` - `hamming` ### Indexing Vector columns must use `null: false` to add a vector index ```ruby class CreateItems < ActiveRecord::Migration[8.1] def change create_table :items do |t| t.vector :embedding, limit: 3, null: false t.index :embedding, type: :vector end end end ``` ### Binary Vectors Use the `bigint` type to store binary vectors ```ruby class AddEmbeddingToItems < ActiveRecord::Migration[8.1] def change add_column :items, :embedding, :bigint end end ``` Note: Binary vectors can have up to 64 dimensions Get the nearest neighbors by Hamming distance ```ruby Item.nearest_neighbors(:embedding, 5, distance: "hamming").first(5) ``` ## MySQL ### Distance Supported values are: - `euclidean` - `cosine` - `hamming` Note: The `DISTANCE()` function is [only available on HeatWave](https://dev.mysql.com/doc/refman/9.0/en/vector-functions.html) ### Binary Vectors Use the `binary` type to store binary vectors ```ruby class AddEmbeddingToItems < ActiveRecord::Migration[8.1] def change add_column :items, :embedding, :binary end end ``` Get the nearest neighbors by Hamming distance ```ruby Item.nearest_neighbors(:embedding, "\x05", distance: "hamming").first(5) ``` ## sqlite-vec ### Distance Supported values are: - `euclidean` - `cosine` - `taxicab` - `hamming` ### Dimensions For sqlite-vec, it’s a good idea to specify the number of dimensions to ensure all records have the same number. ```ruby class Item < ApplicationRecord has_neighbors :embedding, dimensions: 3 end ``` ### Virtual Tables You can also use [virtual tables](https://alexgarcia.xyz/sqlite-vec/features/knn.html) ```ruby class AddEmbeddingToItems < ActiveRecord::Migration[8.1] def change # Rails 8+ create_virtual_table :items, :vec0, [ "id integer PRIMARY KEY AUTOINCREMENT NOT NULL", "embedding float[3] distance_metric=L2" ] # Rails < 8 execute <<~SQL CREATE VIRTUAL TABLE items USING vec0( id integer PRIMARY KEY AUTOINCREMENT NOT NULL, embedding float[3] distance_metric=L2 ) SQL end end ``` Use `distance_metric=cosine` for cosine distance You can optionally ignore any shadow tables that are created ```ruby ActiveRecord::SchemaDumper.ignore_tables += [ "items_chunks", "items_rowids", "items_vector_chunks00" ] ``` Get the `k` nearest neighbors ```ruby Item.where("embedding MATCH ?", [1, 2, 3].to_s).where(k: 5).order(:distance) ``` Filter by primary key ```ruby Item.where(id: [2, 3]).where("embedding MATCH ?", [1, 2, 3].to_s).where(k: 5).order(:distance) ``` ### Int8 Vectors Use the `type` option for int8 vectors ```ruby class Item < ApplicationRecord has_neighbors :embedding, dimensions: 3, type: :int8 end ``` ### Binary Vectors Use the `type` option for binary vectors ```ruby class Item < ApplicationRecord has_neighbors :embedding, dimensions: 8, type: :bit end ``` Get the nearest neighbors by Hamming distance ```ruby Item.nearest_neighbors(:embedding, "\x05", distance: "hamming").first(5) ``` ## Examples - [Embeddings](#openai-embeddings) with OpenAI - [Binary embeddings](#cohere-embeddings) with Cohere - [Sentence embeddings](#sentence-embeddings) with Informers - [Hybrid search](#hybrid-search) with Informers - [Sparse search](#sparse-search) with Transformers.rb - [Recommendations](#disco-recommendations) with Disco ### OpenAI Embeddings Generate a model ```sh rails generate model Document content:text embedding:vector{1536} rails db:migrate ``` And add `has_neighbors` ```ruby class Document < ApplicationRecord has_neighbors :embedding end ``` Create a method to call the [embeddings API](https://platform.openai.com/docs/guides/embeddings) ```ruby def embed(input) url = "https://api.openai.com/v1/embeddings" headers = { "Authorization" => "Bearer #{ENV.fetch("OPENAI_API_KEY")}", "Content-Type" => "application/json" } data = { input: input, model: "text-embedding-3-small" } response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value) JSON.parse(response.body)["data"].map { |v| v["embedding"] } end ``` Pass your input ```ruby input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = embed(input) ``` Store the embeddings ```ruby documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: embedding} end Document.insert_all!(documents) ``` And get similar documents ```ruby document = Document.first document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content) ``` See the [complete code](examples/openai/example.rb) ### Cohere Embeddings Generate a model ```sh rails generate model Document content:text embedding:bit{1536} rails db:migrate ``` And add `has_neighbors` ```ruby class Document < ApplicationRecord has_neighbors :embedding end ``` Create a method to call the [embed API](https://docs.cohere.com/reference/embed) ```ruby def embed(input, input_type) url = "https://api.cohere.com/v2/embed" headers = { "Authorization" => "Bearer #{ENV.fetch("CO_API_KEY")}", "Content-Type" => "application/json" } data = { texts: input, model: "embed-v4.0", input_type: input_type, embedding_types: ["ubinary"] } response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value) JSON.parse(response.body)["embeddings"]["ubinary"].map { |e| e.map { |v| v.chr.unpack1("B*") }.join } end ``` Pass your input ```ruby input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = embed(input, "search_document") ``` Store the embeddings ```ruby documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: embedding} end Document.insert_all!(documents) ``` Embed the search query ```ruby query = "forest" query_embedding = embed([query], "search_query")[0] ``` And search the documents ```ruby Document.nearest_neighbors(:embedding, query_embedding, distance: "hamming").first(5).map(&:content) ``` See the [complete code](examples/cohere/example.rb) ### Sentence Embeddings You can generate embeddings locally with [Informers](https://github.com/ankane/informers). Generate a model ```sh rails generate model Document content:text embedding:vector{384} rails db:migrate ``` And add `has_neighbors` ```ruby class Document < ApplicationRecord has_neighbors :embedding end ``` Load a [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) ```ruby model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2") ``` Pass your input ```ruby input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = model.(input) ``` Store the embeddings ```ruby documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: embedding} end Document.insert_all!(documents) ``` And get similar documents ```ruby document = Document.first document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content) ``` See the [complete code](examples/informers/example.rb) ### Hybrid Search You can use Neighbor for hybrid search with [Informers](https://github.com/ankane/informers). Generate a model ```sh rails generate model Document content:text embedding:vector{768} rails db:migrate ``` And add `has_neighbors` and a scope for keyword search ```ruby class Document < ApplicationRecord has_neighbors :embedding scope :search, ->(query) { where("to_tsvector(content) @@ plainto_tsquery(?)", query) .order(Arel.sql("ts_rank_cd(to_tsvector(content), plainto_tsquery(?)) DESC", query)) } end ``` Create some documents ```ruby Document.create!(content: "The dog is barking") Document.create!(content: "The cat is purring") Document.create!(content: "The bear is growling") ``` Generate an embedding for each document ```ruby embed = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5") embed_options = {model_output: "sentence_embedding", pooling: "none"} # specific to embedding model Document.find_each do |document| embedding = embed.(document.content, **embed_options) document.update!(embedding: embedding) end ``` Perform keyword search ```ruby query = "growling bear" keyword_results = Document.search(query).limit(20).load_async ``` And semantic search in parallel (the query prefix is specific to the [embedding model](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5)) ```ruby query_prefix = "Represent this sentence for searching relevant passages: " query_embedding = embed.(query_prefix + query, **embed_options) semantic_results = Document.nearest_neighbors(:embedding, query_embedding, distance: "cosine").limit(20).load_async ``` To combine the results, use Reciprocal Rank Fusion (RRF) ```ruby Neighbor::Reranking.rrf(keyword_results, semantic_results).first(5) ``` Or a reranking model ```ruby rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1") results = (keyword_results + semantic_results).uniq rerank.(query, results.map(&:content)).first(5).map { |v| results[v[:doc_id]] } ``` See the [complete code](examples/hybrid/example.rb) ### Sparse Search You can generate sparse embeddings locally with [Transformers.rb](https://github.com/ankane/transformers-ruby). Generate a model ```sh rails generate model Document content:text embedding:sparsevec{30522} rails db:migrate ``` And add `has_neighbors` ```ruby class Document < ApplicationRecord has_neighbors :embedding end ``` Load a [model](https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1) to generate embeddings ```ruby class EmbeddingModel def initialize(model_id) @model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id) @tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id) @special_token_ids = @tokenizer.special_tokens_map.map { |_, token| @tokenizer.vocab[token] } end def embed(input) feature = @tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false) output = @model.(**feature)[0] values = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)[0] values = Torch.log(1 + Torch.relu(values)) values[0.., @special_token_ids] = 0 values.to_a end end model = EmbeddingModel.new("opensearch-project/opensearch-neural-sparse-encoding-v1") ``` Pass your input ```ruby input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = model.embed(input) ``` Store the embeddings ```ruby documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: Neighbor::SparseVector.new(embedding)} end Document.insert_all!(documents) ``` Embed the search query ```ruby query = "forest" query_embedding = model.embed([query])[0] ``` And search the documents ```ruby Document.nearest_neighbors(:embedding, Neighbor::SparseVector.new(query_embedding), distance: "inner_product").first(5).map(&:content) ``` See the [complete code](examples/sparse/example.rb) ### Disco Recommendations You can use Neighbor for online item-based recommendations with [Disco](https://github.com/ankane/disco). We’ll use MovieLens data for this example. Generate a model ```sh rails generate model Movie name:string factors:cube rails db:migrate ``` And add `has_neighbors` ```ruby class Movie < ApplicationRecord has_neighbors :factors, dimensions: 20, normalize: true end ``` Fit the recommender ```ruby data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) ``` Store the item factors ```ruby movies = [] recommender.item_ids.each do |item_id| movies << {name: item_id, factors: recommender.item_factors(item_id)} end Movie.create!(movies) ``` And get similar movies ```ruby movie = Movie.find_by(name: "Star Wars (1977)") movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name) ``` See the complete code for [cube](examples/disco/item_recs_cube.rb) and [pgvector](examples/disco/item_recs_vector.rb) ## History View the [changelog](https://github.com/ankane/neighbor/blob/master/CHANGELOG.md) ## Contributing Everyone is encouraged to help improve this project. Here are a few ways you can help: - [Report bugs](https://github.com/ankane/neighbor/issues) - Fix bugs and [submit pull requests](https://github.com/ankane/neighbor/pulls) - Write, clarify, or fix documentation - Suggest or add new features To get started with development: ```sh git clone https://github.com/ankane/neighbor.git cd neighbor bundle install # Postgres createdb neighbor_test bundle exec rake test:postgresql # SQLite bundle exec rake test:sqlite # MariaDB docker run -e MARIADB_ALLOW_EMPTY_ROOT_PASSWORD=1 -e MARIADB_DATABASE=neighbor_test -p 3307:3306 mariadb:11.8 bundle exec rake test:mariadb # MySQL docker run -e MYSQL_ALLOW_EMPTY_PASSWORD=1 -e MYSQL_DATABASE=neighbor_test -p 3306:3306 mysql:9 bundle exec rake test:mysql ``` ankane-neighbor-ae849b6/Rakefile000066400000000000000000000016421516427166400166520ustar00rootroot00000000000000require "bundler/gem_tasks" require "rake/testtask" namespace :test do Rake::TestTask.new(:postgresql) do |t| t.description = "Run tests for Postgres" t.test_files = FileList["test/**/*_test.rb"].exclude("test/{sqlite,mariadb,mysql}*_test.rb") end Rake::TestTask.new(:sqlite) do |t| t.description = "Run tests for SQLite" t.test_files = FileList["test/**/sqlite*_test.rb"] end Rake::TestTask.new(:mariadb) do |t| t.description = "Run tests for MariaDB" t.test_files = FileList["test/**/mariadb*_test.rb"] end Rake::TestTask.new(:mysql) do |t| t.description = "Run tests for MySQL" t.test_files = FileList["test/**/mysql*_test.rb"] end end task :test do [:postgresql, :sqlite, :mariadb, :mysql].each do |adapter| next if adapter == :sqlite && RUBY_ENGINE == "truffleruby" puts "Using #{adapter}" Rake::Task["test:#{adapter}"].invoke end end task default: :test ankane-neighbor-ae849b6/examples/000077500000000000000000000000001516427166400170205ustar00rootroot00000000000000ankane-neighbor-ae849b6/examples/cohere/000077500000000000000000000000001516427166400202655ustar00rootroot00000000000000ankane-neighbor-ae849b6/examples/cohere/Gemfile000066400000000000000000000000771516427166400215640ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: "../.." gem "pg" ankane-neighbor-ae849b6/examples/cohere/example.rb000066400000000000000000000026641516427166400222550ustar00rootroot00000000000000require "bundler/setup" require "json" require "net/http" require "active_record" require "neighbor" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "vector" create_table :documents, force: true do |t| t.text :content t.bit :embedding, limit: 1536 end end class Document < ActiveRecord::Base has_neighbors :embedding end # https://docs.cohere.com/reference/embed def embed(input, input_type) url = "https://api.cohere.com/v2/embed" headers = { "Authorization" => "Bearer #{ENV.fetch("CO_API_KEY")}", "Content-Type" => "application/json" } data = { texts: input, model: "embed-v4.0", input_type: input_type, embedding_types: ["ubinary"] } response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value) JSON.parse(response.body)["embeddings"]["ubinary"].map { |e| e.map { |v| v.chr.unpack1("B*") }.join } end input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = embed(input, "search_document") documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: embedding} end Document.insert_all!(documents) query = "forest" query_embedding = embed([query], "search_query")[0] pp Document.nearest_neighbors(:embedding, query_embedding, distance: "hamming").first(5).map(&:content) ankane-neighbor-ae849b6/examples/disco/000077500000000000000000000000001516427166400201215ustar00rootroot00000000000000ankane-neighbor-ae849b6/examples/disco/Gemfile000066400000000000000000000001131516427166400214070ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: "../.." gem "disco" gem "pg" ankane-neighbor-ae849b6/examples/disco/item_recs_cube.rb000066400000000000000000000015021516427166400234140ustar00rootroot00000000000000require "bundler/setup" require "active_record" require "disco" require "neighbor" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "cube" create_table :movies, force: true do |t| t.string :name t.cube :factors end end class Movie < ActiveRecord::Base has_neighbors :factors, dimensions: 20, normalize: true end data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) movies = [] recommender.item_ids.each do |item_id| movies << {name: item_id, factors: recommender.item_factors(item_id)} end Movie.create!(movies) movie = Movie.find_by(name: "Star Wars (1977)") pp movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name) ankane-neighbor-ae849b6/examples/disco/item_recs_vector.rb000066400000000000000000000014641516427166400240070ustar00rootroot00000000000000require "bundler/setup" require "active_record" require "disco" require "neighbor" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "vector" create_table :movies, force: true do |t| t.string :name t.vector :factors, limit: 20 end end class Movie < ActiveRecord::Base has_neighbors :factors end data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) movies = [] recommender.item_ids.each do |item_id| movies << {name: item_id, factors: recommender.item_factors(item_id)} end Movie.insert_all!(movies) movie = Movie.find_by(name: "Star Wars (1977)") pp movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name) ankane-neighbor-ae849b6/examples/disco/user_recs_cube.rb000066400000000000000000000027741516427166400234500ustar00rootroot00000000000000require "bundler/setup" require "active_record" require "disco" require "neighbor" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "cube" create_table :movies, force: true do |t| t.string :name t.cube :factors end create_table :users, force: true do |t| t.cube :factors end end # use an extra dimension to map inner product to euclidean class Movie < ActiveRecord::Base has_neighbors :factors, dimensions: 21 end class User < ActiveRecord::Base has_neighbors :factors, dimensions: 20 end data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) # inner product to euclidean # https://gist.github.com/mdouze/e4bdb404dbd976c83fe447e529e5c9dc norms = (recommender.item_factors ** 2).sum(axis: 1) phi = norms.max extra = Numo::SFloat::Math.sqrt(phi - norms) movies = [] recommender.item_ids.each_with_index do |item_id, i| movies << {name: item_id, factors: recommender.item_factors(item_id).append(extra[i])} end Movie.insert_all!(movies) users = [] recommender.user_ids.each do |user_id| users << {id: user_id, factors: recommender.user_factors(user_id)} end User.insert_all!(users) user = User.find(123) pp Movie.nearest_neighbors(:factors, user.factors.append(0), distance: "euclidean").first(5).map(&:name) # excludes rated, so will be different for some users # pp recommender.user_recs(user.id).map { |v| v[:item_id] } ankane-neighbor-ae849b6/examples/disco/user_recs_vector.rb000066400000000000000000000023061516427166400240230ustar00rootroot00000000000000require "bundler/setup" require "active_record" require "disco" require "neighbor" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "vector" create_table :movies, force: true do |t| t.string :name t.vector :factors, limit: 20 end create_table :users, force: true do |t| t.vector :factors, limit: 20 end end class Movie < ActiveRecord::Base has_neighbors :factors end class User < ActiveRecord::Base has_neighbors :factors end data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) movies = [] recommender.item_ids.each do |item_id| movies << {name: item_id, factors: recommender.item_factors(item_id)} end Movie.insert_all!(movies) users = [] recommender.user_ids.each do |user_id| users << {id: user_id, factors: recommender.user_factors(user_id)} end User.insert_all!(users) user = User.find(123) pp Movie.nearest_neighbors(:factors, user.factors, distance: "inner_product").first(5).map(&:name) # excludes rated, so will be different for some users # pp recommender.user_recs(user.id).map { |v| v[:item_id] } ankane-neighbor-ae849b6/examples/hybrid/000077500000000000000000000000001516427166400203015ustar00rootroot00000000000000ankane-neighbor-ae849b6/examples/hybrid/Gemfile000066400000000000000000000001331516427166400215710ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: "../.." gem "informers", ">= 1.0.2" gem "pg" ankane-neighbor-ae849b6/examples/hybrid/example.rb000066400000000000000000000046151516427166400222670ustar00rootroot00000000000000require "bundler/setup" require "active_record" require "informers" require "neighbor" ActiveRecord.async_query_executor = :global_thread_pool ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "vector" create_table :documents, force: true do |t| t.text :content t.vector :embedding, limit: 768 end # optional: add indexes add_index :documents, "to_tsvector('english', coalesce(content, ''))", using: :gin add_index :documents, :embedding, using: :hnsw, opclass: :vector_cosine_ops end class Document < ActiveRecord::Base has_neighbors :embedding scope :search, ->(query, columns: [:content], language: "english") { expression = columns.map { |v| "coalesce(#{connection.quote_column_name(v)}, '')" }.join(" || ' ' || ") where("to_tsvector(?, #{expression}) @@ plainto_tsquery(?, ?)", language, language, query) .order(Arel.sql("ts_rank_cd(to_tsvector(?, #{expression}), plainto_tsquery(?, ?)) DESC", language, language, query)) } end Document.create!(content: "The dog is barking") Document.create!(content: "The cat is purring") Document.create!(content: "The bear is growling") embed = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5") embed_options = {model_output: "sentence_embedding", pooling: "none"} # specific to embedding model Document.find_each do |document| embedding = embed.(document.content, **embed_options) document.update!(embedding: embedding) end query = "growling bear" keyword_results = Document.search(query).limit(20).load_async # the query prefix is specific to the embedding model (https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5) query_prefix = "Represent this sentence for searching relevant passages: " query_embedding = embed.(query_prefix + query, **embed_options) semantic_results = Document.nearest_neighbors(:embedding, query_embedding, distance: "cosine").limit(20).load_async # to combine the results, use Reciprocal Rank Fusion (RRF) p Neighbor::Reranking.rrf(keyword_results, semantic_results).first(5).map { |v| v[:result].content } # or a reranking model rerank = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-xsmall-v1") results = (keyword_results + semantic_results).uniq p rerank.(query, results.map(&:content)).first(5).map { |v| results[v[:doc_id]] }.map(&:content) ankane-neighbor-ae849b6/examples/informers/000077500000000000000000000000001516427166400210245ustar00rootroot00000000000000ankane-neighbor-ae849b6/examples/informers/Gemfile000066400000000000000000000001331516427166400223140ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: "../.." gem "informers", ">= 1.0.2" gem "pg" ankane-neighbor-ae849b6/examples/informers/example.rb000066400000000000000000000016251516427166400230100ustar00rootroot00000000000000require "bundler/setup" require "active_record" require "informers" require "neighbor" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "vector" create_table :documents, force: true do |t| t.text :content t.vector :embedding, limit: 384 end end class Document < ActiveRecord::Base has_neighbors :embedding end model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2") input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = model.(input) documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: embedding} end Document.insert_all!(documents) document = Document.first pp document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content) ankane-neighbor-ae849b6/examples/openai/000077500000000000000000000000001516427166400202735ustar00rootroot00000000000000ankane-neighbor-ae849b6/examples/openai/Gemfile000066400000000000000000000000771516427166400215720ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: "../.." gem "pg" ankane-neighbor-ae849b6/examples/openai/example.rb000066400000000000000000000025251516427166400222570ustar00rootroot00000000000000require "bundler/setup" require "json" require "net/http" require "active_record" require "neighbor" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "vector" create_table :documents, force: true do |t| t.text :content t.vector :embedding, limit: 1536 end end class Document < ActiveRecord::Base has_neighbors :embedding end # https://platform.openai.com/docs/guides/embeddings/how-to-get-embeddings # input can be an array with 2048 elements def embed(input) url = "https://api.openai.com/v1/embeddings" headers = { "Authorization" => "Bearer #{ENV.fetch("OPENAI_API_KEY")}", "Content-Type" => "application/json" } data = { input: input, model: "text-embedding-3-small" } response = Net::HTTP.post(URI(url), data.to_json, headers).tap(&:value) JSON.parse(response.body)["data"].map { |v| v["embedding"] } end input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = embed(input) documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: embedding} end Document.insert_all!(documents) document = Document.first pp document.nearest_neighbors(:embedding, distance: "cosine").first(5).map(&:content) ankane-neighbor-ae849b6/examples/sparse/000077500000000000000000000000001516427166400203155ustar00rootroot00000000000000ankane-neighbor-ae849b6/examples/sparse/Gemfile000066400000000000000000000001251516427166400216060ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: "../.." gem "pg" gem "transformers-rb" ankane-neighbor-ae849b6/examples/sparse/example.rb000066400000000000000000000035611516427166400223020ustar00rootroot00000000000000# good resources # https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/ # https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1 require "bundler/setup" require "active_record" require "neighbor" require "transformers-rb" ActiveRecord::Base.establish_connection adapter: "postgresql", database: "neighbor_test" ActiveRecord::Schema.verbose = false ActiveRecord::Schema.define do enable_extension "vector" create_table :documents, force: true do |t| t.text :content t.sparsevec :embedding, limit: 30522 end end class Document < ActiveRecord::Base has_neighbors :embedding end class EmbeddingModel def initialize(model_id) @model = Transformers::AutoModelForMaskedLM.from_pretrained(model_id) @tokenizer = Transformers::AutoTokenizer.from_pretrained(model_id) @special_token_ids = @tokenizer.special_tokens_map.map { |_, token| @tokenizer.vocab[token] } end def embed(input) feature = @tokenizer.(input, padding: true, truncation: true, return_tensors: "pt", return_token_type_ids: false) output = @model.(**feature)[0] values = Torch.max(output * feature[:attention_mask].unsqueeze(-1), dim: 1)[0] values = Torch.log(1 + Torch.relu(values)) values[0.., @special_token_ids] = 0 values.to_a end end model = EmbeddingModel.new("opensearch-project/opensearch-neural-sparse-encoding-v1") input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = model.embed(input) documents = [] input.zip(embeddings) do |content, embedding| documents << {content: content, embedding: Neighbor::SparseVector.new(embedding)} end Document.insert_all!(documents) query = "puppy" query_embedding = model.embed([query])[0] pp Document.nearest_neighbors(:embedding, Neighbor::SparseVector.new(query_embedding), distance: "inner_product").first(5).map(&:content) ankane-neighbor-ae849b6/gemfiles/000077500000000000000000000000001516427166400167755ustar00rootroot00000000000000ankane-neighbor-ae849b6/gemfiles/activerecord72.gemfile000066400000000000000000000003161516427166400231520ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: ".." gem "rake" gem "minitest" gem "activerecord", "~> 7.2.0" gem "pg" gem "sqlite3" gem "sqlite-vec" gem "mysql2" gem "trilogy" gem "railties", require: false ankane-neighbor-ae849b6/gemfiles/activerecord80.gemfile000066400000000000000000000003161516427166400231510ustar00rootroot00000000000000source "https://rubygems.org" gemspec path: ".." gem "rake" gem "minitest" gem "activerecord", "~> 8.0.0" gem "pg" gem "sqlite3" gem "sqlite-vec" gem "mysql2" gem "trilogy" gem "railties", require: false ankane-neighbor-ae849b6/lib/000077500000000000000000000000001516427166400157505ustar00rootroot00000000000000ankane-neighbor-ae849b6/lib/generators/000077500000000000000000000000001516427166400201215ustar00rootroot00000000000000ankane-neighbor-ae849b6/lib/generators/neighbor/000077500000000000000000000000001516427166400217165ustar00rootroot00000000000000ankane-neighbor-ae849b6/lib/generators/neighbor/cube_generator.rb000066400000000000000000000010321516427166400252230ustar00rootroot00000000000000require "rails/generators" require "rails/generators/active_record" module Neighbor module Generators class CubeGenerator < Rails::Generators::Base include ActiveRecord::Generators::Migration source_root File.join(__dir__, "templates") def copy_migration migration_template "cube.rb", "db/migrate/install_neighbor_cube.rb", migration_version: migration_version end def migration_version "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" end end end end ankane-neighbor-ae849b6/lib/generators/neighbor/sqlite_generator.rb000066400000000000000000000004341516427166400256130ustar00rootroot00000000000000require "rails/generators" module Neighbor module Generators class SqliteGenerator < Rails::Generators::Base source_root File.join(__dir__, "templates") def copy_templates template "sqlite.rb", "config/initializers/neighbor.rb" end end end end ankane-neighbor-ae849b6/lib/generators/neighbor/templates/000077500000000000000000000000001516427166400237145ustar00rootroot00000000000000ankane-neighbor-ae849b6/lib/generators/neighbor/templates/cube.rb.tt000066400000000000000000000002071516427166400256040ustar00rootroot00000000000000class <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %> def change enable_extension "cube" end end ankane-neighbor-ae849b6/lib/generators/neighbor/templates/sqlite.rb.tt000066400000000000000000000000751516427166400261720ustar00rootroot00000000000000# Load the sqlite-vec extension Neighbor::SQLite.initialize! ankane-neighbor-ae849b6/lib/generators/neighbor/templates/vector.rb.tt000066400000000000000000000002111516427166400261630ustar00rootroot00000000000000class <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %> def change enable_extension "vector" end end ankane-neighbor-ae849b6/lib/generators/neighbor/vector_generator.rb000066400000000000000000000010401516427166400256060ustar00rootroot00000000000000require "rails/generators" require "rails/generators/active_record" module Neighbor module Generators class VectorGenerator < Rails::Generators::Base include ActiveRecord::Generators::Migration source_root File.join(__dir__, "templates") def copy_migration migration_template "vector.rb", "db/migrate/install_neighbor_vector.rb", migration_version: migration_version end def migration_version "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" end end end end ankane-neighbor-ae849b6/lib/neighbor.rb000066400000000000000000000014501516427166400200720ustar00rootroot00000000000000# dependencies require "active_support" # adapter hooks require_relative "neighbor/mysql" require_relative "neighbor/postgresql" require_relative "neighbor/sqlite" # modules require_relative "neighbor/reranking" require_relative "neighbor/sparse_vector" require_relative "neighbor/utils" require_relative "neighbor/version" module Neighbor class Error < StandardError; end end ActiveSupport.on_load(:active_record) do require_relative "neighbor/attribute" require_relative "neighbor/model" require_relative "neighbor/normalized_attribute" extend Neighbor::Model begin Neighbor::PostgreSQL.initialize! rescue Gem::LoadError # tries to load pg gem, which may not be available end Neighbor::MySQL.initialize! end require_relative "neighbor/railtie" if defined?(Rails::Railtie) ankane-neighbor-ae849b6/lib/neighbor/000077500000000000000000000000001516427166400175455ustar00rootroot00000000000000ankane-neighbor-ae849b6/lib/neighbor/attribute.rb000066400000000000000000000022431516427166400220760ustar00rootroot00000000000000module Neighbor class Attribute < ActiveRecord::Type::Value delegate :type, :serialize, :deserialize, :cast, to: :new_cast_type def initialize(cast_type:, model:, type:, attribute_name:) @cast_type = cast_type @model = model @type = type @attribute_name = attribute_name end private def cast_value(...) new_cast_type.send(:cast_value, ...) end def new_cast_type @new_cast_type ||= begin if @cast_type.is_a?(ActiveModel::Type::Value) case Utils.adapter(@model) when :sqlite case @type&.to_sym when :int8 Type::SqliteInt8Vector.new when :bit @cast_type when :float32, nil Type::SqliteVector.new else raise ArgumentError, "Unsupported type" end when :mariadb if @model.columns_hash[@attribute_name.to_s]&.type == :integer @cast_type else Type::MysqlVector.new end else @cast_type end else @cast_type end end end end end ankane-neighbor-ae849b6/lib/neighbor/model.rb000066400000000000000000000143231516427166400211750ustar00rootroot00000000000000module Neighbor module Model def has_neighbors(*attribute_names, dimensions: nil, normalize: nil, type: nil) if attribute_names.empty? raise ArgumentError, "has_neighbors requires an attribute name" end attribute_names.map!(&:to_sym) class_eval do @neighbor_attributes ||= {} if @neighbor_attributes.empty? def self.neighbor_attributes parent_attributes = if superclass.respond_to?(:neighbor_attributes) superclass.neighbor_attributes else {} end parent_attributes.merge(@neighbor_attributes || {}) end end attribute_names.each do |attribute_name| raise Error, "has_neighbors already called for #{attribute_name.inspect}" if neighbor_attributes[attribute_name] @neighbor_attributes[attribute_name] = {dimensions: dimensions, normalize: normalize, type: type&.to_sym} end decorate_attributes(attribute_names) do |name, cast_type| Neighbor::Attribute.new(cast_type: cast_type, model: self, type: type, attribute_name: name) end if normalize attribute_names.each do |attribute_name| normalizes attribute_name, with: ->(v) { Neighbor::Utils.normalize(v, column_info: columns_hash[attribute_name.to_s]) } end end return if @neighbor_attributes.size != attribute_names.size validate do adapter = Utils.adapter(self.class) self.class.neighbor_attributes.each do |k, v| value = read_attribute(k) next if value.nil? column_info = self.class.columns_hash[k.to_s] dimensions = v[:dimensions] dimensions ||= column_info&.limit unless column_info&.type == :binary type = v[:type] || Utils.type(adapter, column_info&.type) if !Neighbor::Utils.validate_dimensions(value, type, dimensions, adapter).nil? errors.add(k, "must have #{dimensions} dimensions") end if !Neighbor::Utils.validate_finite(value, type) errors.add(k, "must have finite values") end end end scope :nearest_neighbors, ->(attribute_name, vector, distance:, precision: nil) { attribute_name = attribute_name.to_sym options = neighbor_attributes[attribute_name] raise ArgumentError, "Invalid attribute" unless options normalize = options[:normalize] dimensions = options[:dimensions] type = options[:type] return none if vector.nil? distance = distance.to_s column_info = columns_hash[attribute_name.to_s] column_type = column_info&.type adapter = Neighbor::Utils.adapter(klass) if type && adapter != :sqlite raise ArgumentError, "type only works with SQLite" end operator = Neighbor::Utils.operator(adapter, column_type, distance) raise ArgumentError, "Invalid distance: #{distance}" unless operator # ensure normalize set (can be true or false) normalize_required = Utils.normalize_required?(adapter, column_type) if distance == "cosine" && normalize_required && normalize.nil? raise Neighbor::Error, "Set normalize for cosine distance with cube" end column_attribute = klass.type_for_attribute(attribute_name) vector = column_attribute.cast(vector) dimensions ||= column_info&.limit unless column_info&.type == :binary Neighbor::Utils.validate(vector, dimensions: dimensions, type: type || Utils.type(adapter, column_info&.type), adapter: adapter) vector = Neighbor::Utils.normalize(vector, column_info: column_info) if normalize quoted_attribute = nil query = nil connection_pool.with_connection do |c| quoted_attribute = "#{c.quote_table_name(table_name)}.#{c.quote_column_name(attribute_name)}" query = c.quote(column_attribute.serialize(vector)) end if !precision.nil? if adapter != :postgresql || column_type != :vector raise ArgumentError, "Precision not supported for this type" end case precision.to_s when "half" cast_dimensions = dimensions || column_info&.limit raise ArgumentError, "Unknown dimensions" unless cast_dimensions quoted_attribute += "::halfvec(#{connection_pool.with_connection { |c| c.quote(cast_dimensions.to_i) }})" else raise ArgumentError, "Invalid precision" end end order = Utils.order(adapter, type, operator, quoted_attribute, query) # https://stats.stackexchange.com/questions/146221/is-cosine-similarity-identical-to-l2-normalized-euclidean-distance # with normalized vectors: # cosine similarity = 1 - (euclidean distance)**2 / 2 # cosine distance = 1 - cosine similarity # this transformation doesn't change the order, so only needed for select neighbor_distance = if distance == "cosine" && normalize_required "POWER(#{order}, 2) / 2.0" elsif [:vector, :halfvec, :sparsevec].include?(column_type) && distance == "inner_product" "(#{order}) * -1" else order end # for select, use column_names instead of * to account for ignored columns select_columns = select_values.any? ? [] : column_names select(*select_columns, "#{neighbor_distance} AS neighbor_distance") .where.not(attribute_name => nil) .reorder(Arel.sql(order)) } def nearest_neighbors(attribute_name, **options) attribute_name = attribute_name.to_sym # important! check if neighbor attribute before accessing raise ArgumentError, "Invalid attribute" unless self.class.neighbor_attributes[attribute_name] self.class .where.not(Array(self.class.primary_key).to_h { |k| [k, self[k]] }) .nearest_neighbors(attribute_name, self[attribute_name], **options) end end end end end ankane-neighbor-ae849b6/lib/neighbor/mysql.rb000066400000000000000000000020031516427166400212320ustar00rootroot00000000000000module Neighbor module MySQL def self.initialize! require_relative "type/mysql_vector" require "active_record/connection_adapters/abstract_mysql_adapter" # ensure schema can be dumped ActiveRecord::ConnectionAdapters::AbstractMysqlAdapter::NATIVE_DATABASE_TYPES[:vector] = {name: "vector"} # ensure schema can be loaded unless ActiveRecord::ConnectionAdapters::TableDefinition.method_defined?(:vector) ActiveRecord::ConnectionAdapters::TableDefinition.send(:define_column_methods, :vector) end # prevent unknown OID warning ActiveRecord::ConnectionAdapters::AbstractMysqlAdapter.singleton_class.prepend(RegisterTypes) end module RegisterTypes def initialize_type_map(m) super register_vector_type(m) end def register_vector_type(m) m.register_type %r(^vector)i do |sql_type| limit = extract_limit(sql_type) Type::MysqlVector.new(limit: limit) end end end end end ankane-neighbor-ae849b6/lib/neighbor/normalized_attribute.rb000066400000000000000000000010111516427166400243120ustar00rootroot00000000000000module Neighbor class NormalizedAttribute < ActiveRecord::Type::Value delegate :type, :serialize, :deserialize, to: :@cast_type def initialize(cast_type:, model:, attribute_name:) @cast_type = cast_type @model = model @attribute_name = attribute_name.to_s end def cast(...) Neighbor::Utils.normalize(@cast_type.cast(...), column_info: @model.columns_hash[@attribute_name]) end private def cast_value(...) @cast_type.send(:cast_value, ...) end end end ankane-neighbor-ae849b6/lib/neighbor/postgresql.rb000066400000000000000000000042261516427166400223010ustar00rootroot00000000000000module Neighbor module PostgreSQL def self.initialize! require_relative "type/cube" require_relative "type/halfvec" require_relative "type/sparsevec" require_relative "type/vector" require "active_record/connection_adapters/postgresql_adapter" # ensure schema can be dumped ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:cube] = {name: "cube"} ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:halfvec] = {name: "halfvec"} ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:sparsevec] = {name: "sparsevec"} ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:vector] = {name: "vector"} # ensure schema can be loaded ActiveRecord::ConnectionAdapters::TableDefinition.send(:define_column_methods, :cube, :halfvec, :sparsevec, :vector) # prevent unknown OID warning ActiveRecord::ConnectionAdapters::PostgreSQLAdapter.singleton_class.prepend(RegisterTypes) # support vector[]/halfvec[] ActiveRecord::ConnectionAdapters::PostgreSQL::OID::Array.prepend(ArrayMethods) end module RegisterTypes def initialize_type_map(m = type_map) super m.register_type "cube", Type::Cube.new m.register_type "halfvec" do |_, _, sql_type| limit = extract_limit(sql_type) Type::Halfvec.new(limit: limit) end m.register_type "sparsevec" do |_, _, sql_type| limit = extract_limit(sql_type) Type::Sparsevec.new(limit: limit) end m.register_type "vector" do |_, _, sql_type| limit = extract_limit(sql_type) Type::Vector.new(limit: limit) end end end ArrayWrapper = Struct.new(:to_a) module ArrayMethods def type_cast_array(value, method, ...) if (subtype.is_a?(Neighbor::Type::Vector) || subtype.is_a?(Neighbor::Type::Halfvec)) && method != :deserialize && value.is_a?(::Array) && value.all? { |v| v.is_a?(::Numeric) } super(ArrayWrapper.new(value), method, ...) else super end end end end end ankane-neighbor-ae849b6/lib/neighbor/railtie.rb000066400000000000000000000007611516427166400215270ustar00rootroot00000000000000module Neighbor class Railtie < Rails::Railtie generators do require "rails/generators/generated_attribute" # rails generate model Item embedding:vector{3} Rails::Generators::GeneratedAttribute.singleton_class.prepend(Neighbor::GeneratedAttribute) end end module GeneratedAttribute def parse_type_and_options(type, *, **) if type =~ /\A(vector|halfvec|bit|sparsevec)\{(\d+)\}\z/ return $1, limit: $2.to_i end super end end end ankane-neighbor-ae849b6/lib/neighbor/reranking.rb000066400000000000000000000011351516427166400220520ustar00rootroot00000000000000module Neighbor module Reranking def self.rrf(first_ranking, *rankings, k: 60) rankings.unshift(first_ranking) ranks = [] results = [] rankings.each do |ranking| ranks << ranking.map.with_index.to_h { |v, i| [v, i + 1] } results.concat(ranking) end results = results.uniq.map do |result| score = ranks.sum do |rank| r = rank[result] r ? 1.0 / (k + r) : 0.0 end {result: result, score: score} end results.sort_by { |v| -v[:score] } end end end ankane-neighbor-ae849b6/lib/neighbor/sparse_vector.rb000066400000000000000000000035621516427166400227570ustar00rootroot00000000000000module Neighbor class SparseVector attr_reader :dimensions, :indices, :values NO_DEFAULT = Object.new def initialize(value, dimensions = NO_DEFAULT) if value.is_a?(Hash) if dimensions == NO_DEFAULT raise ArgumentError, "missing dimensions" end from_hash(value, dimensions) else unless dimensions == NO_DEFAULT raise ArgumentError, "extra argument" end from_array(value) end end def to_s "{#{@indices.zip(@values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{@dimensions.to_i}" end def to_a arr = Array.new(dimensions, 0.0) @indices.zip(@values) do |i, v| arr[i] = v end arr end private def from_hash(data, dimensions) elements = data.select { |_, v| v != 0 }.sort @dimensions = dimensions.to_i @indices = elements.map { |v| v[0].to_i } @values = elements.map { |v| v[1].to_f } end def from_array(arr) arr = arr.to_a @dimensions = arr.size @indices = [] @values = [] arr.each_with_index do |v, i| if v != 0 @indices << i @values << v.to_f end end end class << self def from_text(string) elements, dimensions = string.split("/", 2) indices = [] values = [] elements[1..-2].split(",").each do |e| index, value = e.split(":", 2) indices << index.to_i - 1 values << value.to_f end from_parts(dimensions.to_i, indices, values) end private def from_parts(dimensions, indices, values) vec = allocate vec.instance_variable_set(:@dimensions, dimensions) vec.instance_variable_set(:@indices, indices) vec.instance_variable_set(:@values, values) vec end end end end ankane-neighbor-ae849b6/lib/neighbor/sqlite.rb000066400000000000000000000012641516427166400213760ustar00rootroot00000000000000module Neighbor module SQLite # note: this is a public API (unlike PostgreSQL and MySQL) def self.initialize! return if defined?(@initialized) require_relative "type/sqlite_vector" require_relative "type/sqlite_int8_vector" require "sqlite_vec" require "active_record/connection_adapters/sqlite3_adapter" ActiveRecord::ConnectionAdapters::SQLite3Adapter.prepend(InstanceMethods) @initialized = true end module InstanceMethods def configure_connection super db = @raw_connection db.enable_load_extension(1) SqliteVec.load(db) db.enable_load_extension(0) end end end end ankane-neighbor-ae849b6/lib/neighbor/type/000077500000000000000000000000001516427166400205265ustar00rootroot00000000000000ankane-neighbor-ae849b6/lib/neighbor/type/cube.rb000066400000000000000000000017351516427166400217770ustar00rootroot00000000000000module Neighbor module Type class Cube < ActiveRecord::Type::Value def type :cube end def serialize(value) if Utils.array?(value) value = value.to_a if value.first.is_a?(Array) value = value.map { |v| serialize_point(v) }.join(", ") else value = serialize_point(value) end end super(value) end private def cast_value(value) if Utils.array?(value) value.to_a elsif value.is_a?(Numeric) [value] elsif value.is_a?(String) if value.include?("),(") value[1..-1].split("),(").map { |v| v.split(",").map(&:to_f) } else value[1..-1].split(",").map(&:to_f) end else raise "can't cast #{value.class.name} to cube" end end def serialize_point(value) "(#{value.map(&:to_f).join(", ")})" end end end end ankane-neighbor-ae849b6/lib/neighbor/type/halfvec.rb000066400000000000000000000010671516427166400224670ustar00rootroot00000000000000module Neighbor module Type class Halfvec < ActiveRecord::Type::Value def type :halfvec end def serialize(value) if Utils.array?(value) value = "[#{value.to_a.map(&:to_f).join(",")}]" end super(value) end private def cast_value(value) if value.is_a?(String) value[1..-1].split(",").map(&:to_f) elsif Utils.array?(value) value.to_a else raise "can't cast #{value.class.name} to halfvec" end end end end end ankane-neighbor-ae849b6/lib/neighbor/type/mysql_vector.rb000066400000000000000000000012011516427166400235740ustar00rootroot00000000000000module Neighbor module Type class MysqlVector < ActiveRecord::Type::Binary def type :vector end def serialize(value) if Utils.array?(value) value = value.to_a.pack("e*") end super(value) end def deserialize(value) value = super cast_value(value) unless value.nil? end private def cast_value(value) if value.is_a?(String) value.unpack("e*") elsif Utils.array?(value) value.to_a else raise "can't cast #{value.class.name} to vector" end end end end end ankane-neighbor-ae849b6/lib/neighbor/type/sparsevec.rb000066400000000000000000000013321516427166400230450ustar00rootroot00000000000000module Neighbor module Type class Sparsevec < ActiveRecord::Type::Value def type :sparsevec end def serialize(value) if value.is_a?(SparseVector) value = "{#{value.indices.zip(value.values).map { |i, v| "#{i.to_i + 1}:#{v.to_f}" }.join(",")}}/#{value.dimensions.to_i}" end super(value) end private def cast_value(value) if value.is_a?(SparseVector) value elsif value.is_a?(String) SparseVector.from_text(value) elsif Utils.array?(value) value = SparseVector.new(value.to_a) else raise "can't cast #{value.class.name} to sparsevec" end end end end end ankane-neighbor-ae849b6/lib/neighbor/type/sqlite_int8_vector.rb000066400000000000000000000011341516427166400246770ustar00rootroot00000000000000module Neighbor module Type class SqliteInt8Vector < ActiveRecord::Type::Binary def serialize(value) if Utils.array?(value) value = value.to_a.pack("c*") end super(value) end def deserialize(value) value = super cast_value(value) unless value.nil? end private def cast_value(value) if value.is_a?(String) value.unpack("c*") elsif Utils.array?(value) value.to_a else raise "can't cast #{value.class.name} to vector" end end end end end ankane-neighbor-ae849b6/lib/neighbor/type/sqlite_vector.rb000066400000000000000000000011301516427166400237310ustar00rootroot00000000000000module Neighbor module Type class SqliteVector < ActiveRecord::Type::Binary def serialize(value) if Utils.array?(value) value = value.to_a.pack("f*") end super(value) end def deserialize(value) value = super cast_value(value) unless value.nil? end private def cast_value(value) if value.is_a?(String) value.unpack("f*") elsif Utils.array?(value) value.to_a else raise "can't cast #{value.class.name} to vector" end end end end end ankane-neighbor-ae849b6/lib/neighbor/type/vector.rb000066400000000000000000000010641516427166400223560ustar00rootroot00000000000000module Neighbor module Type class Vector < ActiveRecord::Type::Value def type :vector end def serialize(value) if Utils.array?(value) value = "[#{value.to_a.map(&:to_f).join(",")}]" end super(value) end private def cast_value(value) if value.is_a?(String) value[1..-1].split(",").map(&:to_f) elsif Utils.array?(value) value.to_a else raise "can't cast #{value.class.name} to vector" end end end end end ankane-neighbor-ae849b6/lib/neighbor/utils.rb000066400000000000000000000117031516427166400212340ustar00rootroot00000000000000module Neighbor module Utils def self.validate_dimensions(value, type, expected, adapter) dimensions = type == :sparsevec ? value.dimensions : value.size dimensions *= 8 if type == :bit && [:sqlite, :mysql].include?(adapter) if expected && dimensions != expected "Expected #{expected} dimensions, not #{dimensions}" end end def self.validate_finite(value, type) case type when :bit, :integer true when :sparsevec value.values.all?(&:finite?) else value.all?(&:finite?) end end def self.validate(value, dimensions:, type:, adapter:) if (message = validate_dimensions(value, type, dimensions, adapter)) raise Error, message end if !validate_finite(value, type) raise Error, "Values must be finite" end end def self.normalize(value, column_info:) return nil if value.nil? raise Error, "Normalize not supported for type" unless [:cube, :vector, :halfvec].include?(column_info&.type) norm = Math.sqrt(value.sum { |v| v * v }) # store zero vector as all zeros # since NaN makes the distance always 0 # could also throw error norm > 0 ? value.map { |v| v / norm } : value end def self.array?(value) !value.nil? && value.respond_to?(:to_a) end def self.adapter(model) case model.connection_db_config.adapter when /sqlite/i :sqlite when /mysql|trilogy/i model.connection_pool.with_connection { |c| c.try(:mariadb?) } ? :mariadb : :mysql else :postgresql end end def self.type(adapter, column_type) case adapter when :mysql if column_type == :binary :bit else column_type end else column_type end end def self.operator(adapter, column_type, distance) case adapter when :sqlite case distance when "euclidean" "vec_distance_L2" when "cosine" "vec_distance_cosine" when "taxicab" "vec_distance_L1" when "hamming" "vec_distance_hamming" end when :mariadb case column_type when :vector case distance when "euclidean" "VEC_DISTANCE_EUCLIDEAN" when "cosine" "VEC_DISTANCE_COSINE" end when :integer case distance when "hamming" "BIT_COUNT" end else raise ArgumentError, "Unsupported type: #{column_type}" end when :mysql case column_type when :vector case distance when "cosine" "COSINE" when "euclidean" "EUCLIDEAN" end when :binary case distance when "hamming" "BIT_COUNT" end else raise ArgumentError, "Unsupported type: #{column_type}" end else case column_type when :bit case distance when "hamming" "<~>" when "jaccard" "<%>" when "hamming2" "#" end when :vector, :halfvec, :sparsevec case distance when "inner_product" "<#>" when "cosine" "<=>" when "euclidean" "<->" when "taxicab" "<+>" end when :cube case distance when "taxicab" "<#>" when "chebyshev" "<=>" when "euclidean", "cosine" "<->" end else raise ArgumentError, "Unsupported type: #{column_type}" end end end def self.order(adapter, type, operator, quoted_attribute, query) case adapter when :sqlite case type when :int8 "#{operator}(vec_int8(#{quoted_attribute}), vec_int8(#{query}))" when :bit "#{operator}(vec_bit(#{quoted_attribute}), vec_bit(#{query}))" else "#{operator}(#{quoted_attribute}, #{query})" end when :mariadb if operator == "BIT_COUNT" "BIT_COUNT(#{quoted_attribute} ^ #{query})" else "#{operator}(#{quoted_attribute}, #{query})" end when :mysql if operator == "BIT_COUNT" "BIT_COUNT(#{quoted_attribute} ^ #{query})" elsif operator == "COSINE" "DISTANCE(#{quoted_attribute}, #{query}, 'COSINE')" else "DISTANCE(#{quoted_attribute}, #{query}, 'EUCLIDEAN')" end else if operator == "#" "bit_count(#{quoted_attribute} # #{query})" else "#{quoted_attribute} #{operator} #{query}" end end end def self.normalize_required?(adapter, column_type) case adapter when :postgresql column_type == :cube else false end end end end ankane-neighbor-ae849b6/lib/neighbor/version.rb000066400000000000000000000000501516427166400215520ustar00rootroot00000000000000module Neighbor VERSION = "1.0.0" end ankane-neighbor-ae849b6/neighbor.gemspec000066400000000000000000000010541516427166400203440ustar00rootroot00000000000000require_relative "lib/neighbor/version" Gem::Specification.new do |spec| spec.name = "neighbor" spec.version = Neighbor::VERSION spec.summary = "Nearest neighbor search for Rails" spec.homepage = "https://github.com/ankane/neighbor" spec.license = "MIT" spec.author = "Andrew Kane" spec.email = "andrew@ankane.org" spec.files = Dir["*.{md,txt}", "{lib}/**/*"] spec.require_path = "lib" spec.required_ruby_version = ">= 3.3" spec.add_dependency "activerecord", ">= 7.2" end ankane-neighbor-ae849b6/test/000077500000000000000000000000001516427166400161615ustar00rootroot00000000000000ankane-neighbor-ae849b6/test/bit_test.rb000066400000000000000000000042201516427166400203210ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/postgresql" class BitTest < PostgresTest def test_hamming create_bit_items result = Item.find(1).nearest_neighbors(:binary_embedding, distance: "hamming").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [2, 3], result.map(&:neighbor_distance) end def test_hamming_scope create_bit_items result = Item.nearest_neighbors(:binary_embedding, "101", distance: "hamming").first(5) assert_equal [2, 3, 1], result.map(&:id) assert_elements_in_delta [0, 1, 2], result.map(&:neighbor_distance) end def test_hamming2 create_bit_items result = Item.find(1).nearest_neighbors(:binary_embedding, distance: "hamming2").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [2, 3], result.map(&:neighbor_distance) end def test_hamming2_scope create_bit_items result = Item.nearest_neighbors(:binary_embedding, "101", distance: "hamming2").first(5) assert_equal [2, 3, 1], result.map(&:id) assert_elements_in_delta [0, 1, 2], result.map(&:neighbor_distance) end def test_jaccard create_bit_items result = Item.find(2).nearest_neighbors(:binary_embedding, distance: "jaccard").first(3) assert_equal [3, 1], result.map(&:id) assert_elements_in_delta [1/3.0, 1], result.map(&:neighbor_distance) end def test_jaccard_scope create_bit_items result = Item.nearest_neighbors(:binary_embedding, "100", distance: "jaccard").first(5) assert_equal [2, 3, 1], result.map(&:id) assert_elements_in_delta [0.5, 2/3.0, 1], result.map(&:neighbor_distance) end def test_index_scan assert_index_scan Item.nearest_neighbors(:binary_embedding, "101", distance: "hamming") end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(binary_embedding: "01") end assert_equal "Validation failed: Binary embedding must have 3 dimensions", error.message end def create_bit_items Item.create!(id: 1, binary_embedding: "000") Item.create!(id: 2, binary_embedding: "101") Item.create!(id: 3, binary_embedding: "111") end end ankane-neighbor-ae849b6/test/cube_generator_test.rb000066400000000000000000000006031516427166400225300ustar00rootroot00000000000000require_relative "test_helper" require "generators/neighbor/cube_generator" class CubeGeneratorTest < Rails::Generators::TestCase tests Neighbor::Generators::CubeGenerator destination File.expand_path("../tmp", __dir__) setup :prepare_destination def test_works run_generator assert_migration "db/migrate/install_neighbor_cube.rb", /enable_extension "cube"/ end end ankane-neighbor-ae849b6/test/cube_test.rb000066400000000000000000000105361516427166400204700ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/postgresql" class CubeTest < PostgresTest def test_cosine create_items(CosineItem, :cube_embedding) result = CosineItem.find(1).nearest_neighbors(:cube_embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_cosine_no_normalize create_items(Item, :cube_embedding) error = assert_raises(Neighbor::Error) do Item.find(1).nearest_neighbors(:cube_embedding, distance: "cosine").first(3) end assert_equal "Set normalize for cosine distance with cube", error.message end def test_euclidean create_items(Item, :cube_embedding) result = Item.find(1).nearest_neighbors(:cube_embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_taxicab create_items(Item, :cube_embedding) result = Item.find(1).nearest_neighbors(:cube_embedding, distance: "taxicab").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, 3], result.map(&:neighbor_distance) end def test_chebyshev create_items(Item, :cube_embedding) result = Item.find(1).nearest_neighbors(:cube_embedding, distance: "chebyshev").first(3) assert_equal [2, 3], result.map(&:id).sort # same distance assert_elements_in_delta [1, 1], result.map(&:neighbor_distance) end def test_index_scan assert_index_scan Item.nearest_neighbors(:cube_embedding, [0, 0, 0], distance: "euclidean") end def test_type Item.create!(cube_factors: "(1,2,3)") assert_equal [1, 2, 3], Item.last.cube_factors Item.create!(cube_factors: [1, 2, 3]) assert_equal [1, 2, 3], Item.last.cube_factors Item.create!(cube_factors: 1) assert_equal [1], Item.last.cube_factors Item.create!(cube_factors: [[1, 2, 3], [4, 5, 6]]) assert_equal [[1, 2, 3], [4, 5, 6]], Item.last.cube_factors end def test_cosine_zero create_items(CosineItem, :cube_embedding) CosineItem.create!(id: 4, cube_embedding: [0, 0, 0]) assert_equal [0, 0, 0], CosineItem.last.cube_embedding assert_equal "(0, 0, 0)", CosineItem.connection.select_all("SELECT cube_embedding FROM items WHERE id = 4").first["cube_embedding"] result = CosineItem.find(3).nearest_neighbors(:cube_embedding, distance: "cosine").to_a.last assert_equal 4, result.id assert_in_delta 0.5, result.neighbor_distance result = CosineItem.find(4).nearest_neighbors(:cube_embedding, distance: "cosine").first(3) assert_elements_in_delta [0.5, 0.5, 0.5], result.map(&:neighbor_distance) end def test_large_dimensions error = assert_raises(ActiveRecord::StatementInvalid) do LargeDimensionsItem.create!(cube_embedding: 101.times.to_a) end assert_match "cannot have more than 100 dimensions", error.message end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do DimensionsItem.create!(cube_embedding: [1, 1]) end assert_equal "Validation failed: Cube embedding must have 3 dimensions", error.message end def test_infinite error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(cube_embedding: [Float::INFINITY, 0, 0]) end assert_equal "Validation failed: Cube embedding must have finite values", error.message end def test_nan error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(cube_embedding: [Float::NAN, 0, 0]) end assert_equal "Validation failed: Cube embedding must have finite values", error.message end def test_normalize item = CosineItem.new item.cube_embedding = [0, 3, 4] assert_elements_in_delta [0, 0.6, 0.8], item.cube_embedding item.save! assert_elements_in_delta [0, 0.6, 0.8], item.cube_embedding assert_elements_in_delta [0, 0.6, 0.8], Item.last.cube_embedding item.cube_embedding = nil item.save! assert_nil item.cube_embedding assert_nil Item.last.cube_embedding end def test_insert CosineItem.insert!({cube_embedding: [0, 3, 4]}) assert_elements_in_delta [0, 0.6, 0.8], Item.last.cube_embedding end def test_insert_all CosineItem.insert_all!([{cube_embedding: [0, 3, 4]}]) assert_elements_in_delta [0, 0.6, 0.8], Item.last.cube_embedding end end ankane-neighbor-ae849b6/test/halfvec_test.rb000066400000000000000000000053541516427166400211640ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/postgresql" class HalfvecTest < PostgresTest def test_cosine create_items(Item, :half_embedding) result = Item.find(1).nearest_neighbors(:half_embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_euclidean create_items(Item, :half_embedding) result = Item.find(1).nearest_neighbors(:half_embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_taxicab create_items(Item, :half_embedding) result = Item.find(1).nearest_neighbors(:half_embedding, distance: "taxicab").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, 3], result.map(&:neighbor_distance) end def test_inner_product create_items(Item, :half_embedding) result = Item.find(1).nearest_neighbors(:half_embedding, distance: "inner_product").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [6, 4], result.map(&:neighbor_distance) end def test_index_scan assert_index_scan Item.nearest_neighbors(:half_embedding, [0, 0, 0], distance: "cosine") end def test_type Item.create!(half_factors: "[1,2,3]") assert_equal [1, 2, 3], Item.last.half_factors Item.create!(half_factors: [1, 2, 3]) assert_equal [1, 2, 3], Item.last.half_factors end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(half_embedding: [1, 1]) end assert_equal "Validation failed: Half embedding must have 3 dimensions", error.message end def test_infinite error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(half_embedding: [Float::INFINITY, 0, 0]) end assert_equal "Validation failed: Half embedding must have finite values", error.message end def test_nan error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(half_embedding: [Float::NAN, 0, 0]) end assert_equal "Validation failed: Half embedding must have finite values", error.message end def test_array item = Item.create!(half_embeddings: [[1, 2, 3], [4, 5, 6]]) assert_equal [[1, 2, 3], [4, 5, 6]], item.half_embeddings assert_equal [[1, 2, 3], [4, 5, 6]], Item.last.half_embeddings end def test_array_2d item = Item.create!(half_embeddings: [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) assert_equal [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], item.half_embeddings assert_equal [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], Item.last.half_embeddings end end ankane-neighbor-ae849b6/test/mariadb_bit_test.rb000066400000000000000000000016031516427166400220020ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/mariadb" class MariadbBitTest < Minitest::Test def setup MariadbBinaryItem.delete_all end def test_hamming create_bit_items result = MariadbBinaryItem.find(1).nearest_neighbors(:binary_embedding, distance: "hamming").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [2, 3], result.map(&:neighbor_distance) end def test_hamming_scope create_bit_items result = MariadbBinaryItem.nearest_neighbors(:binary_embedding, 5, distance: "hamming").first(5) assert_equal [2, 3, 1], result.map(&:id) assert_elements_in_delta [0, 1, 2], result.map(&:neighbor_distance) end def create_bit_items MariadbBinaryItem.create!(id: 1, binary_embedding: 0) MariadbBinaryItem.create!(id: 2, binary_embedding: 5) MariadbBinaryItem.create!(id: 3, binary_embedding: 7) end end ankane-neighbor-ae849b6/test/mariadb_test.rb000066400000000000000000000054101516427166400211440ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/mariadb" class MariadbTest < Minitest::Test def setup MariadbItem.delete_all end def test_cosine create_items(MariadbCosineItem, :embedding) result = MariadbCosineItem.find(1).nearest_neighbors(:embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_euclidean create_items(MariadbItem, :embedding) result = MariadbItem.find(1).nearest_neighbors(:embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_index_scan skip "Occasionally freezes server" assert_index_scan MariadbItem.nearest_neighbors(:embedding, [0, 0, 0], distance: "euclidean") end def test_create item = MariadbItem.create!(embedding: [1, 2, 3]) assert_equal [1, 2, 3], item.embedding end def test_vec_totext MariadbItem.create!(embedding: [1, 2, 3]) assert_equal "[1,2,3]", MariadbItem.pluck("VEC_ToText(embedding)").last end def test_vec_fromtext MariadbItem.connection.execute("INSERT INTO mariadb_items (embedding) VALUES (Vec_FromText('[1,2,3]'))") assert_equal [1, 2, 3], MariadbItem.last.embedding end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do MariadbDimensionsItem.create!(embedding: [1, 1]) end assert_equal "Validation failed: Embedding must have 3 dimensions", error.message end def test_infinite error = assert_raises(ActiveRecord::RecordInvalid) do MariadbItem.create!(embedding: [Float::INFINITY, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end def test_nan error = assert_raises(ActiveRecord::RecordInvalid) do MariadbItem.create!(embedding: [Float::NAN, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end def assert_index_scan(relation) assert_match "index_mariadb_items_on_embedding", relation.limit(5).explain.inspect end def test_normalize item = MariadbCosineItem.new item.embedding = [0, 3, 4] assert_elements_in_delta [0, 0.6, 0.8], item.embedding item.save! assert_elements_in_delta [0, 0.6, 0.8], item.embedding assert_elements_in_delta [0, 0.6, 0.8], MariadbItem.last.embedding end def test_insert MariadbCosineItem.insert!({embedding: [0, 3, 4]}) assert_elements_in_delta [0, 0.6, 0.8], MariadbItem.last.embedding end def test_insert_all MariadbCosineItem.insert_all!([{embedding: [0, 3, 4]}]) assert_elements_in_delta [0, 0.6, 0.8], MariadbItem.last.embedding end end ankane-neighbor-ae849b6/test/mysql_bit_test.rb000066400000000000000000000022621516427166400215520ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/mysql" class MysqlBitTest < Minitest::Test def setup MysqlItem.delete_all end def test_hamming create_bit_items result = MysqlItem.find(1).nearest_neighbors(:binary_embedding, distance: "hamming").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [2, 3].map { |v| v * 1024 }, result.map(&:neighbor_distance) end def test_hamming_scope create_bit_items result = MysqlItem.nearest_neighbors(:binary_embedding, "\x05" * 1024, distance: "hamming").first(5) assert_equal [2, 3, 1], result.map(&:id) assert_elements_in_delta [0, 1, 2].map { |v| v * 1024 }, result.map(&:neighbor_distance) end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do MysqlItem.create!(binary_embedding: "\x00" * 1024 + "\x11") end assert_equal "Validation failed: Binary embedding must have 8192 dimensions", error.message end def create_bit_items MysqlItem.create!(id: 1, binary_embedding: "\x00" * 1024) MysqlItem.create!(id: 2, binary_embedding: "\x05" * 1024) MysqlItem.create!(id: 3, binary_embedding: "\x07" * 1024) end end ankane-neighbor-ae849b6/test/mysql_test.rb000066400000000000000000000043471516427166400207220ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/mysql" class MysqlTest < Minitest::Test def setup MysqlItem.delete_all end def test_cosine skip "Requires HeatWave" create_items(MysqlItem, :embedding) result = MysqlItem.find(1).nearest_neighbors(:embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_euclidean skip "Requires HeatWave" create_items(MysqlItem, :embedding) result = MysqlItem.find(1).nearest_neighbors(:embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_create item = MysqlItem.create!(embedding: [1, 2, 3]) assert_equal [1, 2, 3], item.embedding end def test_vector_to_string MysqlItem.create!(embedding: [1, 2, 3]) assert_equal "[1.00000e+00,2.00000e+00,3.00000e+00]", MysqlItem.pluck("VECTOR_TO_STRING(embedding)").last end def test_string_to_vector MysqlItem.connection.execute("INSERT INTO mysql_items (embedding) VALUES (STRING_TO_VECTOR('[1,2,3]'))") assert_equal [1, 2, 3], MysqlItem.last.embedding end def test_schema file = Tempfile.new connection = MysqlRecord.connection_pool ActiveRecord::SchemaDumper.dump(connection, file) file.rewind contents = file.read refute_match "Could not dump table", contents assert_match %{t.vector "embedding", limit: 3}, contents end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do MysqlItem.create!(embedding: [1, 1]) end assert_equal "Validation failed: Embedding must have 3 dimensions", error.message end def test_infinite error = assert_raises(ActiveRecord::RecordInvalid) do MysqlItem.create!(embedding: [Float::INFINITY, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end def test_nan error = assert_raises(ActiveRecord::RecordInvalid) do MysqlItem.create!(embedding: [Float::NAN, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end end ankane-neighbor-ae849b6/test/neighbor_test.rb000066400000000000000000000125341516427166400213470ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/postgresql" class NeighborTest < PostgresTest def test_schema file = Tempfile.new connection = PostgresRecord.connection_pool ActiveRecord::SchemaDumper.dump(connection, file) file.rewind contents = file.read refute_match "Could not dump table", contents assert_match %{t.cube "cube_embedding"}, contents assert_match %{t.vector "embedding", limit: 3}, contents assert_match %{t.halfvec "half_embedding", limit: 3}, contents assert_match %{t.bit "binary_embedding", limit: 3}, contents assert_match %{t.sparsevec "sparse_embedding", limit: 3}, contents assert_match %{t.vector "embeddings", limit: 3, array: true}, contents end def test_connection_leasing PostgresRecord.connection_handler.clear_active_connections! assert_nil PostgresRecord.connection_pool.active_connection? PostgresRecord.connection_pool.with_connection do Item.nearest_neighbors(:embedding, [1, 1, 1], distance: "euclidean") end assert_nil PostgresRecord.connection_pool.active_connection? end def test_composite_primary_key Product.create!(id: [1, "A"], embedding: [1, 1, 1]) Product.create!(id: [1, "B"], embedding: [2, 2, 2]) Product.create!(id: [2, "A"], embedding: [1, 1, 2]) result = Product.first.nearest_neighbors(:embedding, distance: "euclidean").first(3) assert_equal [[2, "A"], [1, "B"]], result.map(&:id) end def test_neighbor_attributes assert_equal Item.neighbor_attributes.keys.sort, [:embedding, :cube_embedding, :half_embedding, :binary_embedding, :sparse_embedding].sort end def test_no_attribute error = assert_raises(ArgumentError) do Item.has_neighbors end assert_equal "has_neighbors requires an attribute name", error.message end def test_already_defined error = assert_raises(Neighbor::Error) do Item.has_neighbors :embedding end assert_equal "has_neighbors already called for :embedding", error.message end def test_relation create_items(Item, :embedding) assert_equal [2], Item.find(1).nearest_neighbors(:embedding, distance: "euclidean").where(id: 2).map(&:id) end # need to use unscope or count(:all) def test_relation_count create_items(Item, :embedding) assert_equal 2, Item.find(1).nearest_neighbors(:embedding, distance: "euclidean").unscope(:select).count assert_equal 2, Item.find(2).nearest_neighbors(:embedding, distance: "euclidean").count(:all) end def test_empty create_items(CosineItem, :embedding) CosineItem.create!(id: 4, embedding: nil) result = CosineItem.find(1).nearest_neighbors(:embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) assert_empty CosineItem.find(4).nearest_neighbors(:embedding, distance: "cosine").first(3) end def test_scope create_items(CosineItem, :embedding) result = CosineItem.nearest_neighbors(:embedding, [3, 3, 3], distance: "cosine").first(5) assert_equal 3, result.size assert_equal [1, 2], result.map(&:id).first(2).sort # same distance assert_equal 3, result.map(&:id).last assert_elements_in_delta [0, 0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_scope_invalid_dimensions error = assert_raises(Neighbor::Error) do DimensionsItem.nearest_neighbors(:embedding, [3, 3], distance: "euclidean").first(5) end assert_equal "Expected 3 dimensions, not 2", error.message end def test_scope_select create_items(CosineItem, :embedding) item = CosineItem.select(:id, :factors).nearest_neighbors(:embedding, [3, 3, 3], distance: "euclidean").first assert item.has_attribute?(:id) assert item.has_attribute?(:factors) refute item.has_attribute?(:embedding) end def test_default_scope create_items(Item, :embedding) assert_equal [1, 3, 2], DefaultScopeItem.nearest_neighbors(:embedding, [0, 0, 0], distance: "euclidean").pluck(:id) assert_equal [3, 2], DefaultScopeItem.find(1).nearest_neighbors(:embedding, distance: "euclidean").pluck(:id) end def test_pluck create_items(Item, :embedding) assert_equal [1, 3, 2], Item.nearest_neighbors(:embedding, [0, 0, 0], distance: "euclidean").pluck(:id) end def test_reselect create_items(Item, :embedding) result = Item.nearest_neighbors(:embedding, [0, 0, 0], distance: "euclidean").reselect(:id).first(5) assert_equal [1, 3, 2], result.map(&:id) end def test_attribute_not_loaded create_items(Item, :embedding) assert_raises(ActiveModel::MissingAttributeError) do Item.select(:id).find(1).nearest_neighbors(:embedding, distance: "euclidean") end end def test_invalid_distance error = assert_raises(ArgumentError) do Item.nearest_neighbors(:embedding, [1, 2, 3], distance: "bad") end assert_equal "Invalid distance: bad", error.message end def test_invalid_attribute create_items(Item, :embedding) error = assert_raises(ArgumentError) do Item.find(1).nearest_neighbors(:bad, distance: "euclidean") end assert_equal "Invalid attribute", error.message end def test_invalid_attribute_scope error = assert_raises(ArgumentError) do Item.nearest_neighbors(:bad, [0, 0, 0], distance: "euclidean") end assert_equal "Invalid attribute", error.message end end ankane-neighbor-ae849b6/test/reranking_test.rb000066400000000000000000000011501516427166400215220ustar00rootroot00000000000000require_relative "test_helper" class RerankingTest < Minitest::Test def test_rrf keyword_results = ["C"] semantic_results = ["C", "A", "B"] results = Neighbor::Reranking.rrf(keyword_results, semantic_results) assert_equal ["C", "A", "B"], results.map { |v| v[:result] } assert_elements_in_delta [0.03279, 0.01612, 0.01587], results.map { |v| v[:score] } end def test_rrf_k results = Neighbor::Reranking.rrf(["A", "B", "C"], k: 0) assert_equal ["A", "B", "C"], results.map { |v| v[:result] } assert_elements_in_delta [1, 0.5, 1 / 3.0], results.map { |v| v[:score] } end end ankane-neighbor-ae849b6/test/sparsevec_test.rb000066400000000000000000000063401516427166400215430ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/postgresql" class SparsevecTest < PostgresTest def test_cosine create_items(Item, :sparse_embedding) result = Item.find(1).nearest_neighbors(:sparse_embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_euclidean create_items(Item, :sparse_embedding) result = Item.find(1).nearest_neighbors(:sparse_embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_taxicab create_items(Item, :sparse_embedding) result = Item.find(1).nearest_neighbors(:sparse_embedding, distance: "taxicab").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, 3], result.map(&:neighbor_distance) end def test_inner_product create_items(Item, :sparse_embedding) result = Item.find(1).nearest_neighbors(:sparse_embedding, distance: "inner_product").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [6, 4], result.map(&:neighbor_distance) end def test_index_scan assert_index_scan Item.nearest_neighbors(:sparse_embedding, [0, 0, 0], distance: "cosine") end def test_half_precision create_items(Item, :sparse_embedding) error = assert_raises(ArgumentError) do Item.nearest_neighbors(:sparse_embedding, [0, 0, 0], distance: "euclidean", precision: "half") end assert_equal "Precision not supported for this type", error.message end def test_type Item.create!(sparse_factors: "{1:1,3:2,5:3}/5") factors = Item.last.sparse_factors assert_equal 5, factors.dimensions assert_equal [0, 2, 4], factors.indices assert_equal [1, 2, 3], factors.values assert_equal [1, 0, 2, 0, 3], factors.to_a Item.create!(sparse_factors: [0, 4, 0, 5, 0]) factors = Item.last.sparse_factors assert_equal [0, 4, 0, 5, 0], factors.to_a Item.create!(sparse_factors: Neighbor::SparseVector.new({1 => 6, 2 => 7, 4 => 8}, 5)) factors = Item.last.sparse_factors assert_equal [0, 6, 7, 0, 8], factors.to_a end def test_from_dense embedding = Neighbor::SparseVector.new([1, 0, 2, 0, 3, 0]) assert_equal [1, 0, 2, 0, 3, 0], embedding.to_a assert_equal 6, embedding.dimensions assert_equal [0, 2, 4], embedding.indices assert_equal [1, 2, 3], embedding.values end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(sparse_embedding: Neighbor::SparseVector.new({}, 2)) end assert_equal "Validation failed: Sparse embedding must have 3 dimensions", error.message end def test_infinite error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(sparse_embedding: [Float::INFINITY, 0, 0]) end assert_equal "Validation failed: Sparse embedding must have finite values", error.message end def test_nan error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(sparse_embedding: [Float::NAN, 0, 0]) end assert_equal "Validation failed: Sparse embedding must have finite values", error.message end end ankane-neighbor-ae849b6/test/sqlite_bit_test.rb000066400000000000000000000021461516427166400217070ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/sqlite" class SqliteBitTest < Minitest::Test def setup SqliteItem.delete_all end def test_hamming create_bit_items result = SqliteItem.find(1).nearest_neighbors(:binary_embedding, distance: "hamming").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [2, 3], result.map(&:neighbor_distance) end def test_hamming_scope create_bit_items result = SqliteItem.nearest_neighbors(:binary_embedding, "\x05", distance: "hamming").first(5) assert_equal [2, 3, 1], result.map(&:id) assert_elements_in_delta [0, 1, 2], result.map(&:neighbor_distance) end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do SqliteItem.create!(binary_embedding: "\x00\x11") end assert_equal "Validation failed: Binary embedding must have 8 dimensions", error.message end def create_bit_items SqliteItem.create!(id: 1, binary_embedding: "\x00") SqliteItem.create!(id: 2, binary_embedding: "\x05") SqliteItem.create!(id: 3, binary_embedding: "\x07") end end ankane-neighbor-ae849b6/test/sqlite_float32_test.rb000066400000000000000000000054201516427166400224010ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/sqlite" class SqliteFloat32Test < Minitest::Test def setup SqliteItem.delete_all end def test_cosine create_items(SqliteItem, :embedding) result = SqliteItem.find(1).nearest_neighbors(:embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_euclidean create_items(SqliteItem, :embedding) result = SqliteItem.find(1).nearest_neighbors(:embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_taxicab create_items(SqliteItem, :embedding) result = SqliteItem.find(1).nearest_neighbors(:embedding, distance: "taxicab").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, 3], result.map(&:neighbor_distance) end def test_create item = SqliteItem.create!(embedding: [1, 2, 3]) assert_equal [1, 2, 3], item.embedding end def test_vec_to_json SqliteItem.create!(embedding: [1, 2, 3]) assert_equal "[1.000000,2.000000,3.000000]", SqliteItem.pluck("vec_to_json(embedding)").last end def test_schema file = Tempfile.new connection = SqliteItem.connection_pool ignore_tables = ActiveRecord::VERSION::MAJOR >= 8 ? [/_vector_chunks00\z/] : [/\Avec_items/, /\Acosine_items/] with_ignore_tables(ignore_tables) do ActiveRecord::SchemaDumper.dump(connection, file) end file.rewind contents = file.read assert_match %{t.binary "embedding"}, contents if ActiveRecord::VERSION::MAJOR >= 8 assert_match %{create_virtual_table "vec_items", "vec0"}, contents end refute_match "Could not dump table", contents end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do SqliteItem.create!(embedding: [1, 1]) end assert_match "Validation failed: Embedding must have 3 dimensions", error.message end def test_infinite error = assert_raises(ActiveRecord::RecordInvalid) do SqliteItem.create!(embedding: [Float::INFINITY, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end def test_nan error = assert_raises(ActiveRecord::RecordInvalid) do SqliteItem.create!(embedding: [Float::NAN, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end def with_ignore_tables(value) previous_value = ActiveRecord::SchemaDumper.ignore_tables begin ActiveRecord::SchemaDumper.ignore_tables = value yield ensure ActiveRecord::SchemaDumper.ignore_tables = previous_value end end end ankane-neighbor-ae849b6/test/sqlite_generator_test.rb000066400000000000000000000006051516427166400231150ustar00rootroot00000000000000require_relative "test_helper" require "generators/neighbor/sqlite_generator" class SqliteGeneratorTest < Rails::Generators::TestCase tests Neighbor::Generators::SqliteGenerator destination File.expand_path("../tmp", __dir__) setup :prepare_destination def test_works run_generator assert_file "config/initializers/neighbor.rb", /Neighbor::SQLite.initialize!/ end end ankane-neighbor-ae849b6/test/sqlite_int8_test.rb000066400000000000000000000017771516427166400220240ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/sqlite" class SqliteInt8Test < Minitest::Test def setup SqliteItem.delete_all end def test_cosine create_items(SqliteItem, :int8_embedding) result = SqliteItem.find(1).nearest_neighbors(:int8_embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_euclidean create_items(SqliteItem, :int8_embedding) result = SqliteItem.find(1).nearest_neighbors(:int8_embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_taxicab create_items(SqliteItem, :int8_embedding) result = SqliteItem.find(1).nearest_neighbors(:int8_embedding, distance: "taxicab").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, 3], result.map(&:neighbor_distance) end end ankane-neighbor-ae849b6/test/sqlite_virtual_test.rb000066400000000000000000000047541516427166400226260ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/sqlite" class SqliteVirtualTest < Minitest::Test def setup SqliteVecItem.delete_all SqliteCosineItem.delete_all end def test_cosine create_items(SqliteCosineItem, :embedding) relation = SqliteCosineItem.where("embedding MATCH ?", "[1, 1, 1]").order(:distance).limit(3) assert_elements_in_delta [0, 0, 0.05719095841050148], relation.pluck(:distance) assert_match "SCAN cosine_items VIRTUAL TABLE INDEX", relation.explain.inspect relation = SqliteCosineItem.where("embedding MATCH ? AND k = ?", "[1, 1, 1]", 3).order(:distance) assert_elements_in_delta [0, 0, 0.05719095841050148], relation.pluck(:distance) end def test_euclidean create_items(SqliteVecItem, :embedding) relation = SqliteVecItem.where("embedding MATCH ?", [1, 1, 1].to_s).order(:distance).limit(3) assert_equal [1, 3, 2], relation.all.map(&:id) assert_equal [1, 3, 2], relation.pluck(:id) assert_elements_in_delta [0, 1, Math.sqrt(3)], relation.pluck(:distance) assert_match "SCAN vec_items VIRTUAL TABLE INDEX", relation.explain.inspect relation = SqliteVecItem.where("embedding MATCH ? AND k = ?", [1, 1, 1].to_s, 3).order(:distance) assert_elements_in_delta [0, 1, Math.sqrt(3)], relation.pluck(:distance) end def test_no_limit error = assert_raises(ActiveRecord::StatementInvalid) do SqliteVecItem.where("embedding MATCH ?", "[0, 0, 0]").order(:distance).load end assert_match "A LIMIT or 'k = ?' constraint is required on vec0 knn queries.", error.message end def test_where_limit skip if SQLite3::VERSION.to_i < 2 error = assert_raises(ActiveRecord::StatementInvalid) do SqliteVecItem.where.not(embedding: nil).where("embedding MATCH ?", "[0, 0, 0]").order(:distance).limit(3).load end assert_match "A LIMIT or 'k = ?' constraint is required on vec0 knn queries.", error.message end def test_where_k assert SqliteVecItem.where.not(embedding: nil).where("embedding MATCH ? AND k = ?", "[0, 0, 0]", 3).order(:distance).load end def test_where_id create_items(SqliteVecItem, :embedding) relation = SqliteVecItem.where(id: [2, 3]).where("embedding MATCH ?", [1, 1, 1].to_s).where(k: 5).order(:distance) assert_equal [3, 2], relation.pluck(:id) end def test_create_returning_id item = SqliteVecItem.create!(embedding: [1, 2, 3]) # TODO figure out why id not set assert_nil item.id assert_kind_of Integer, SqliteVecItem.last.id end end ankane-neighbor-ae849b6/test/support/000077500000000000000000000000001516427166400176755ustar00rootroot00000000000000ankane-neighbor-ae849b6/test/support/mariadb.rb000066400000000000000000000024631516427166400216260ustar00rootroot00000000000000class MariadbRecord < ActiveRecord::Base self.abstract_class = true establish_connection adapter: "mysql2", database: "neighbor_test", host: "127.0.0.1", port: 3307, username: "root" end begin MariadbRecord.connection.verify! rescue => e abort <<~MSG Database connection failed: #{e.message} To use the Docker container, run: docker run -e MARIADB_ALLOW_EMPTY_ROOT_PASSWORD=1 -e MARIADB_DATABASE=neighbor_test -p 3307:3306 mariadb:11.8 (and wait for it to be ready) MSG end MariadbRecord.connection.instance_eval do create_table :mariadb_items, force: true do |t| t.vector :embedding, limit: 3, null: false t.index :embedding, type: :vector end create_table :mariadb_binary_items, force: true do |t| t.bigint :binary_embedding end end class MariadbItem < MariadbRecord has_neighbors :embedding end class MariadbCosineItem < MariadbRecord has_neighbors :embedding, normalize: true self.table_name = "mariadb_items" end class MariadbDimensionsItem < MariadbRecord has_neighbors :embedding, dimensions: 3 self.table_name = "mariadb_items" end class MariadbBinaryItem < MariadbRecord has_neighbors :binary_embedding end # ensure has_neighbors does not cause model schema to load raise "has_neighbors loading model schema early" if MariadbItem.send(:schema_loaded?) ankane-neighbor-ae849b6/test/support/mysql.rb000066400000000000000000000016451516427166400213750ustar00rootroot00000000000000class MysqlRecord < ActiveRecord::Base self.abstract_class = true establish_connection adapter: (ENV["TEST_TRILOGY"] ? "trilogy" : "mysql2"), database: "neighbor_test", host: "127.0.0.1", username: "root" end begin MysqlRecord.connection.verify! rescue => e abort <<~MSG Database connection failed: #{e.message} To use the Docker container, run: docker run -e MYSQL_ALLOW_EMPTY_PASSWORD=1 -e MYSQL_DATABASE=neighbor_test -p 3306:3306 mysql:9 (and wait for it to be ready) MSG end MysqlRecord.connection.instance_eval do create_table :mysql_items, force: true do |t| t.vector :embedding, limit: 3 t.binary :binary_embedding end end class MysqlItem < MysqlRecord has_neighbors :embedding has_neighbors :binary_embedding, dimensions: 8192 end # ensure has_neighbors does not cause model schema to load raise "has_neighbors loading model schema early" if MysqlItem.send(:schema_loaded?) ankane-neighbor-ae849b6/test/support/postgresql.rb000066400000000000000000000051141516427166400224260ustar00rootroot00000000000000class PostgresRecord < ActiveRecord::Base self.abstract_class = true establish_connection adapter: "postgresql", database: "neighbor_test" end PostgresRecord.connection.instance_eval do enable_extension "cube" enable_extension "vector" create_table :items, force: true do |t| t.cube :cube_embedding t.cube :cube_factors t.vector :embedding, limit: 3 t.vector :factors, limit: 3 t.halfvec :half_embedding, limit: 3 t.halfvec :half_factors, limit: 3 t.bit :binary_embedding, limit: 3 t.sparsevec :sparse_embedding, limit: 3 t.sparsevec :sparse_factors, limit: 5 t.vector :embeddings, limit: 3, array: true t.halfvec :half_embeddings, limit: 3, array: true end add_index :items, :cube_embedding, using: :gist add_index :items, :embedding, using: :hnsw, opclass: :vector_cosine_ops add_index :items, :half_embedding, using: :hnsw, opclass: :halfvec_cosine_ops add_index :items, :binary_embedding, using: :hnsw, opclass: :bit_hamming_ops add_index :items, :sparse_embedding, using: :hnsw, opclass: :sparsevec_cosine_ops add_index :items, "(embedding::halfvec(3)) halfvec_l2_ops", using: :hnsw add_index :items, "(binary_quantize(embedding)::bit(3)) bit_hamming_ops", using: :hnsw create_table :products, primary_key: [:store_id, :name], force: true do |t| t.integer :store_id t.string :name t.vector :embedding, limit: 3 end end class Item < PostgresRecord has_neighbors :embedding, :cube_embedding, :half_embedding, :binary_embedding, :sparse_embedding end class CosineItem < PostgresRecord has_neighbors :embedding has_neighbors :cube_embedding, normalize: true self.table_name = "items" end class DimensionsItem < PostgresRecord has_neighbors :embedding, dimensions: 3 has_neighbors :cube_embedding, dimensions: 3 self.table_name = "items" end class LargeDimensionsItem < PostgresRecord has_neighbors :embedding, dimensions: 16001 has_neighbors :cube_embedding, dimensions: 101 self.table_name = "items" end class DefaultScopeItem < PostgresRecord default_scope { order(:id) } has_neighbors :embedding self.table_name = "items" end class Product < PostgresRecord has_neighbors :embedding end # ensure has_neighbors does not cause model schema to load raise "has_neighbors loading model schema early" if Item.send(:schema_loaded?) class PostgresTest < Minitest::Test def setup Item.delete_all end def assert_index_scan(relation) Item.transaction do Item.connection.execute("SET LOCAL enable_seqscan = off") assert_match "Index Scan", relation.limit(5).explain.inspect end end end ankane-neighbor-ae849b6/test/support/sqlite.rb000066400000000000000000000033251516427166400215260ustar00rootroot00000000000000class SqliteRecord < ActiveRecord::Base self.abstract_class = true establish_connection adapter: "sqlite3", database: ":memory:" end Neighbor::SQLite.initialize! SqliteRecord.connection.instance_eval do create_table :items, force: true do |t| t.binary :embedding t.binary :int8_embedding t.binary :binary_embedding end if ActiveRecord::VERSION::MAJOR >= 8 create_virtual_table :vec_items, :vec0, [ "id integer PRIMARY KEY AUTOINCREMENT NOT NULL", "embedding float[3] distance_metric=L2" ] else execute <<~SQL CREATE VIRTUAL TABLE vec_items USING vec0( id integer PRIMARY KEY AUTOINCREMENT NOT NULL, embedding float[3] distance_metric=L2 ) SQL end if ActiveRecord::VERSION::MAJOR >= 8 create_virtual_table :cosine_items, :vec0, [ "id integer PRIMARY KEY AUTOINCREMENT NOT NULL", "embedding float[3] distance_metric=cosine" ] else execute <<~SQL CREATE VIRTUAL TABLE cosine_items USING vec0( id integer PRIMARY KEY AUTOINCREMENT NOT NULL, embedding float[3] distance_metric=cosine ) SQL end end class SqliteItem < SqliteRecord has_neighbors :embedding, dimensions: 3 has_neighbors :int8_embedding, dimensions: 3, type: :int8 has_neighbors :binary_embedding, dimensions: 8, type: :bit self.table_name = "items" end class SqliteVecItem < SqliteRecord has_neighbors :embedding, dimensions: 3 self.table_name = "vec_items" end class SqliteCosineItem < SqliteRecord has_neighbors :embedding, dimensions: 3 self.table_name = "cosine_items" end # ensure has_neighbors does not cause model schema to load raise "has_neighbors loading model schema early" if SqliteItem.send(:schema_loaded?) ankane-neighbor-ae849b6/test/test_helper.rb000066400000000000000000000013111516427166400210200ustar00rootroot00000000000000require "bundler/setup" Bundler.require(:default) require "minitest/autorun" require "active_record" logger = ActiveSupport::Logger.new(ENV["VERBOSE"] ? STDOUT : nil) ActiveRecord::Schema.verbose = false unless ENV["VERBOSE"] ActiveRecord::Base.logger = logger ActiveRecord::Base.partial_inserts = false class Minitest::Test def assert_elements_in_delta(expected, actual) assert_equal expected.size, actual.size expected.zip(actual) do |exp, act| assert_in_delta exp, act end end def create_items(cls, attribute) vectors = [ [1, 1, 1], [2, 2, 2], [1, 1, 2] ] vectors.each.with_index do |v, i| cls.create!(id: i + 1, attribute => v) end end end ankane-neighbor-ae849b6/test/vector_generator_test.rb000066400000000000000000000006151516427166400231170ustar00rootroot00000000000000require_relative "test_helper" require "generators/neighbor/vector_generator" class VectorGeneratorTest < Rails::Generators::TestCase tests Neighbor::Generators::VectorGenerator destination File.expand_path("../tmp", __dir__) setup :prepare_destination def test_works run_generator assert_migration "db/migrate/install_neighbor_vector.rb", /enable_extension "vector"/ end end ankane-neighbor-ae849b6/test/vector_test.rb000066400000000000000000000076021516427166400210540ustar00rootroot00000000000000require_relative "test_helper" require_relative "support/postgresql" class VectorTest < PostgresTest def test_cosine create_items(Item, :embedding) result = Item.find(1).nearest_neighbors(:embedding, distance: "cosine").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [0, 0.05719095841050148], result.map(&:neighbor_distance) end def test_euclidean create_items(Item, :embedding) result = Item.find(1).nearest_neighbors(:embedding, distance: "euclidean").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, Math.sqrt(3)], result.map(&:neighbor_distance) end def test_taxicab create_items(Item, :embedding) result = Item.find(1).nearest_neighbors(:embedding, distance: "taxicab").first(3) assert_equal [3, 2], result.map(&:id) assert_elements_in_delta [1, 3], result.map(&:neighbor_distance) end def test_inner_product create_items(Item, :embedding) result = Item.find(1).nearest_neighbors(:embedding, distance: "inner_product").first(3) assert_equal [2, 3], result.map(&:id) assert_elements_in_delta [6, 4], result.map(&:neighbor_distance) end def test_index_scan assert_index_scan Item.nearest_neighbors(:embedding, [0, 0, 0], distance: "cosine") end def test_half_precision create_items(Item, :embedding) relation = Item.nearest_neighbors(:embedding, [0, 0, 0], distance: "euclidean", precision: "half") assert_equal [1, 3, 2], relation.pluck(:id) assert_index_scan relation end def test_invalid_precision error = assert_raises(ArgumentError) do Item.nearest_neighbors(:embedding, [1, 2, 3], distance: "euclidean", precision: "bad") end assert_equal "Invalid precision", error.message end def test_type Item.create!(factors: "[1,2,3]") assert_equal [1, 2, 3], Item.last.factors Item.create!(factors: [1, 2, 3]) assert_equal [1, 2, 3], Item.last.factors end def test_cosine_zero create_items(Item, :embedding) Item.create!(id: 4, embedding: [0, 0, 0]) assert_equal [0, 0, 0], Item.last.embedding assert_equal "[0,0,0]", Item.connection.select_all("SELECT embedding FROM items WHERE id = 4").first["embedding"] result = Item.find(3).nearest_neighbors(:embedding, distance: "cosine").to_a.last assert_equal 4, result.id assert result.neighbor_distance.nan? result = Item.find(4).nearest_neighbors(:embedding, distance: "cosine").first(3) assert result.map(&:neighbor_distance).all?(&:nan?) end def test_large_dimensions error = assert_raises(ActiveRecord::StatementInvalid) do LargeDimensionsItem.create!(embedding: 16001.times.to_a) end assert_match "cannot have more than 16000 dimensions", error.message end def test_invalid_dimensions error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(embedding: [1, 1]) end assert_equal "Validation failed: Embedding must have 3 dimensions", error.message end def test_infinite error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(embedding: [Float::INFINITY, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end def test_nan error = assert_raises(ActiveRecord::RecordInvalid) do Item.create!(embedding: [Float::NAN, 0, 0]) end assert_equal "Validation failed: Embedding must have finite values", error.message end def test_array item = Item.create!(embeddings: [[1, 2, 3], [4, 5, 6]]) assert_equal [[1, 2, 3], [4, 5, 6]], item.embeddings assert_equal [[1, 2, 3], [4, 5, 6]], Item.last.embeddings end def test_array_2d item = Item.create!(embeddings: [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) assert_equal [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], item.embeddings assert_equal [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], Item.last.embeddings end end