pax_global_header00006660000000000000000000000064137676475720014542gustar00rootroot0000000000000052 comment=cb834897ed3b94c6a6fb37ec57bc7cdd651c6c32 bloomfilter-2.0.3/000077500000000000000000000000001376764757200140625ustar00rootroot00000000000000bloomfilter-2.0.3/.circleci/000077500000000000000000000000001376764757200157155ustar00rootroot00000000000000bloomfilter-2.0.3/.circleci/config.yml000066400000000000000000000015451376764757200177120ustar00rootroot00000000000000# Golang CircleCI 2.0 configuration file # # Check https://circleci.com/docs/2.0/language-go/ for more details version: 2 jobs: build: docker: # specify the version - image: circleci/golang:1.14 working_directory: /go/src/github.com/holiman/bloomfilter steps: - checkout # specify any bash command here prefixed with `run: ` - run: go get -v -t -d ./... - run: (cd v2 && go test -v ./... -coverprofile=coverage.txt -covermode=count ) - run: name: "Codecov upload" command: bash <(curl -s https://codecov.io/bash) - run: name: "Install tools" command: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.23.8 - run: name: "Lint" command: (cd v2 && golangci-lint run) bloomfilter-2.0.3/.deepsource.toml000066400000000000000000000002341376764757200171720ustar00rootroot00000000000000version = 1 test_patterns = ["*_test.go"] [[analyzers]] name = "go" enabled = true [analyzers.meta] import_paths = ["github.com/holiman/bloomfilter"]bloomfilter-2.0.3/MIT-LICENSE.txt000066400000000000000000000021031376764757200163300ustar00rootroot00000000000000The MIT License (MIT) Copyright © 2014, 2015 Barry Allard Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. bloomfilter-2.0.3/README.md000066400000000000000000000104211376764757200153370ustar00rootroot00000000000000 [![GoDoc](https://godoc.org/github.com/holiman/bloomfilter?status.png)](https://godoc.org/github.com/holiman/bloomfilter) [![CircleCI](https://circleci.com/gh/holiman/bloomfilter.svg?style=svg)](https://app.circleci.com/pipelines/github/holiman/bloomfilter) [![codecov](https://codecov.io/gh/holiman/bloomfilter/branch/master/graph/badge.svg?token=O48l6LbHkL)](https://codecov.io/gh/holiman/bloomfilter) [![DeepSource](https://deepsource.io/gh/holiman/bloomfilter.svg/?label=active+issues&show_trend=true)](https://deepsource.io/gh/holiman/bloomfilter/?ref=repository-badge) # History This bloom filter implementation is a fork from [steakknife/bloomfilter](https://github.com/steakknife/bloomfilter) by Barry Allard. The upstream project is now archived, so this fork exists to fix some bugs and also make a few improvements. Below is the original description. The original implemenation is Copyright © 2014-2016,2018 Barry Allard [MIT license](MIT-LICENSE.txt) All recent changes are copyright © 2019-2020 Martin Holst Swende. ## Installation ``` $ go get github.com/holiman/bloomfilter ``` ## Face-meltingly fast, thread-safe, marshalable, unionable, probability- and optimal-size-calculating Bloom filter in go ### WTF is a bloom filter **TL;DR:** Probabilistic, extra lookup table to track a set of elements kept elsewhere to reduce expensive, unnecessary set element retrieval and/or iterator operations **when an element is not present in the set.** It's a classic time-storage tradeoff algoritm. ### Properties #### [See wikipedia](https://en.wikipedia.org/wiki/Bloom_filter) for algorithm details |Impact|What|Description| |---|---|---| |Good|No false negatives|know for certain if a given element is definitely NOT in the set| |Bad|False positives|uncertain if a given element is in the set| |Bad|Theoretical potential for hash collisions|in very large systems and/or badly hash.Hash64-conforming implementations| |Bad|Add only|Cannot remove an element, it would destroy information about other elements| |Good|Constant storage|uses only a fixed amount of memory| ## Naming conventions (Similar to algorithm) |Variable/function|Description|Range| |---|---|---| |m/M()|number of bits in the bloom filter (memory representation is about m/8 bytes in size)|>=2| |n/N()|number of elements present|>=0| |k/K()|number of keys to use (keys are kept private to user code but are de/serialized to Marshal and file I/O)|>=0| |maxN|maximum capacity of intended structure|>0| |p|maximum allowed probability of collision (for computing m and k for optimal sizing)|>0..<1| - Memory representation should be exactly `24 + 8*(k + (m+63)/64) + unsafe.Sizeof(RWMutex)` bytes. - Serialized (`BinaryMarshaler`) representation should be exactly `72 + 8*(k + (m+63)/64)` bytes. (Disk format is less due to compression.) ## Binary serialization format All values in Little-endian format |Offset|Offset (Hex)|Length (bytes)|Name|Type| |---|---|---|---|---| |0|00|12|magic + version number|`\0\0\0\0\0\0\0\0v02\n`| |12|0c|8|k|`uint64`| |20|14|8|n|`uint64`| |28|1c|8|m|`uint64`| |36|24|k|(keys)|`[k]uint64`| |36+8*k|...|(m+63)/64|(bloom filter)|`[(m+63)/64]uint64`| |36+8\*k+8\*((m+63)/64)|...|48|(SHA384 of all previous fields, hashed in order)|`[48]byte`| - `bloomfilter.Filter` conforms to `encoding.BinaryMarshaler` and `encoding.BinaryUnmarshaler' ## Usage ```go import "github.com/holiman/bloomfilter" const ( maxElements = 100000 probCollide = 0.0000001 ) bf, err := bloomfilter.NewOptimal(maxElements, probCollide) if err != nil { panic(err) } someValue := ... // must conform to hash.Hash64 bf.Add(someValue) if bf.Contains(someValue) { // probably true, could be false // whatever } anotherValue := ... // must also conform to hash.Hash64 if bf.Contains(anotherValue) { panic("This should never happen") } err := bf.WriteFile("1.bf.gz") // saves this BF to a file if err != nil { panic(err) } bf2, err := bloomfilter.ReadFile("1.bf.gz") // read the BF to another var if err != nil { panic(err) } ``` ## Design Where possible, branch-free operations are used to avoid deep pipeline / execution unit stalls on branch-misses. ## Contact - [Issues](https://github.com/holiman/bloomfilter/issues) ## License [MIT license](MIT-LICENSE.txt) Copyright © 2014-2016 Barry Allard Copyright © 2019-2020 Martin Holst Swende bloomfilter-2.0.3/codecov.yml000066400000000000000000000001611376764757200162250ustar00rootroot00000000000000 codecov: require_ci_to_pass: no coverage: status: project: no patch: no comment: layout: "diff" bloomfilter-2.0.3/go.mod000066400000000000000000000000571376764757200151720ustar00rootroot00000000000000module github.com/holiman/bloomfilter go 1.15 bloomfilter-2.0.3/v2/000077500000000000000000000000001376764757200144115ustar00rootroot00000000000000bloomfilter-2.0.3/v2/binarymarshaler.go000066400000000000000000000055101376764757200201240ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 import ( "bytes" "crypto/sha512" "encoding/binary" "io" ) // headerMagic is used to disambiguate between this package and the original // steakknife implementation. // Since the key hashing algorithm has changed, the format is no longer // binary compatible var version = []byte("v02\n") var headerMagic = append([]byte{0, 0, 0, 0, 0, 0, 0, 0}, version...) // counter is a utility to count bytes written type counter struct { bytes int } func (c *counter) Write(p []byte) (n int, err error) { count := len(p) c.bytes += count return count, nil } // conforms to encoding.BinaryMarshaler // MarshallToWriter marshalls the filter into the given io.Writer // Binary layout (Little Endian): // // k 1 uint64 // n 1 uint64 // m 1 uint64 // keys [k]uint64 // bits [(m+63)/64]uint64 // hash sha384 (384 bits == 48 bytes) // // size = (3 + k + (m+63)/64) * 8 bytes // func (f *Filter) MarshallToWriter(out io.Writer) (int, [sha512.Size384]byte, error) { var ( c = &counter{0} hasher = sha512.New384() mw = io.MultiWriter(out, hasher, c) hash [sha512.Size384]byte ) f.lock.RLock() defer f.lock.RUnlock() if _, err := mw.Write(headerMagic); err != nil { return c.bytes, hash, err } if err := binary.Write(mw, binary.LittleEndian, []uint64{f.K(), f.n, f.m}); err != nil { return c.bytes, hash, err } if err := binary.Write(mw, binary.LittleEndian, f.keys); err != nil { return c.bytes, hash, err } // Write it in chunks of 5% (but at least 4K). Otherwise, the binary.Write will allocate a // same-size slice of bytes, doubling the memory usage var chunkSize = len(f.bits) / 20 if chunkSize < 512 { chunkSize = 512 // Min 4K bytes (512 uint64s) } buf := make([]byte, chunkSize*8) for start := 0; start < len(f.bits); { end := start + chunkSize if end > len(f.bits) { end = len(f.bits) } for i, x := range f.bits[start:end] { binary.LittleEndian.PutUint64(buf[8*i:], x) } if _, err := mw.Write(buf[0 : (end-start)*8]); err != nil { return c.bytes, hash, err } start = end } // Now we stop using the multiwriter, pick out the hash of what we've // written so far, and then write the hash to the output hashbytes := hasher.Sum(nil) copy(hash[:], hashbytes[:sha512.Size384]) err := binary.Write(out, binary.LittleEndian, hashbytes) return c.bytes + len(hashbytes), hash, err } // MarshalBinary converts a Filter into []bytes func (f *Filter) MarshalBinary() (data []byte, err error) { buf := new(bytes.Buffer) _, _, err = f.MarshallToWriter(buf) if err != nil { return nil, err } data = buf.Bytes() return data, nil } bloomfilter-2.0.3/v2/binaryunmarshaler.go000066400000000000000000000057001376764757200204700ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 import ( "bytes" "crypto/sha512" "encoding/binary" "fmt" "hash" "io" ) func unmarshalBinaryHeader(r io.Reader) (k, n, m uint64, err error) { magic := make([]byte, len(headerMagic)) if _, err := io.ReadFull(r, magic); err != nil { return 0, 0, 0, err } if !bytes.Equal(magic, headerMagic) { return 0, 0, 0, fmt.Errorf("incompatible version (wrong magic), got %x", magic) } var knm = make([]uint64, 3) err = binary.Read(r, binary.LittleEndian, knm) if err != nil { return 0, 0, 0, err } k = knm[0] n = knm[1] m = knm[2] if k < KMin { return 0, 0, 0, fmt.Errorf("keys must have length %d or greater (was %d)", KMin, k) } if m < MMin { return 0, 0, 0, fmt.Errorf("number of bits in the filter must be >= %d (was %d)", MMin, m) } return k, n, m, err } func unmarshalBinaryBits(r io.Reader, m uint64) (bits []uint64, err error) { bits, err = newBits(m) if err != nil { return bits, err } bs := make([]byte, 8) for i := 0; i < len(bits) && err == nil; i++ { _, err = io.ReadFull(r, bs) bits[i] = binary.LittleEndian.Uint64(bs) } if err != nil { return nil, err } return bits, nil } func unmarshalBinaryKeys(r io.Reader, k uint64) (keys []uint64, err error) { keys = make([]uint64, k) err = binary.Read(r, binary.LittleEndian, keys) return keys, err } // hashingReader can be used to read from a reader, and simultaneously // do a hash on the bytes that were read type hashingReader struct { reader io.Reader hasher hash.Hash tot int64 } func (h *hashingReader) Read(p []byte) (n int, err error) { n, err = h.reader.Read(p) h.tot += int64(n) if err != nil { return n, err } _, _ = h.hasher.Write(p[:n]) return n, err } // UnmarshalBinary converts []bytes into a Filter // conforms to encoding.BinaryUnmarshaler func (f *Filter) UnmarshalBinary(data []byte) (err error) { buf := bytes.NewBuffer(data) _, err = f.UnmarshalFromReader(buf) return err } func (f *Filter) UnmarshalFromReader(input io.Reader) (n int64, err error) { f.lock.Lock() defer f.lock.Unlock() buf := &hashingReader{ reader: input, hasher: sha512.New384(), } var k uint64 k, f.n, f.m, err = unmarshalBinaryHeader(buf) if err != nil { return buf.tot, err } f.keys, err = unmarshalBinaryKeys(buf, k) if err != nil { return buf.tot, err } f.bits, err = unmarshalBinaryBits(buf, f.m) if err != nil { return buf.tot, err } // Only the hash remains to be read now // so abort the hasher at this point gotHash := buf.hasher.Sum(nil) expHash := make([]byte, sha512.Size384) err = binary.Read(buf, binary.LittleEndian, expHash) if err != nil { return buf.tot, err } if !bytes.Equal(gotHash, expHash) { return buf.tot, errHashMismatch } return buf.tot, nil } bloomfilter-2.0.3/v2/bloomfilter.go000066400000000000000000000063371376764757200172670ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // // Copyright © 2020 Martin Holst Swende, continued on the work of Barry Allard package v2 import ( "errors" "hash" "sync" ) var ( errHashMismatch = errors.New("hash mismatch, bloom filter corruption or wrong version") ) // Filter is an opaque Bloom filter type type Filter struct { lock sync.RWMutex bits []uint64 keys []uint64 m uint64 // number of bits the "bits" field should recognize n uint64 // number of inserted elements } // M is the size of Bloom filter, in bits func (f *Filter) M() uint64 { return f.m } // K is the count of keys func (f *Filter) K() uint64 { return uint64(len(f.keys)) } // Add a hashable item, v, to the filter func (f *Filter) Add(v hash.Hash64) { f.AddHash(v.Sum64()) } // rotation sets how much to rotate the hash on each filter iteration. This // is somewhat randomly set to a prime on the lower segment of 64. At 17, the cycle // does not repeat for quite a while, but even for low number of filters the // changes are quite rapid const rotation = 17 // Adds an already hashes item to the filter. // Identical to Add (but slightly faster) func (f *Filter) AddHash(hash uint64) { f.lock.Lock() defer f.lock.Unlock() var ( i uint64 ) for n := 0; n < len(f.keys); n++ { hash = ((hash << rotation) | (hash >> (64 - rotation))) ^ f.keys[n] i = hash % f.m f.bits[i>>6] |= 1 << uint(i&0x3f) } f.n++ } // ContainsHash tests if f contains the (already hashed) key // Identical to Contains but slightly faster func (f *Filter) ContainsHash(hash uint64) bool { f.lock.RLock() defer f.lock.RUnlock() var ( i uint64 r = uint64(1) ) for n := 0; n < len(f.keys) && r != 0; n++ { hash = ((hash << rotation) | (hash >> (64 - rotation))) ^ f.keys[n] i = hash % f.m r &= (f.bits[i>>6] >> uint(i&0x3f)) & 1 } return r != 0 } // Contains tests if f contains v // false: f definitely does not contain value v // true: f maybe contains value v func (f *Filter) Contains(v hash.Hash64) bool { return f.ContainsHash(v.Sum64()) } // Copy f to a new Bloom filter func (f *Filter) Copy() (*Filter, error) { f.lock.RLock() defer f.lock.RUnlock() out, err := f.NewCompatible() if err != nil { return nil, err } copy(out.bits, f.bits) out.n = f.n return out, nil } // UnionInPlace merges Bloom filter f2 into f func (f *Filter) UnionInPlace(f2 *Filter) error { if !f.IsCompatible(f2) { return errors.New("incompatible bloom filters") } f.lock.Lock() defer f.lock.Unlock() for i, bitword := range f2.bits { f.bits[i] |= bitword } // Also update the counters f.n += f2.n return nil } // Union merges f2 and f2 into a new Filter out func (f *Filter) Union(f2 *Filter) (out *Filter, err error) { if !f.IsCompatible(f2) { return nil, errors.New("incompatible bloom filters") } f.lock.RLock() defer f.lock.RUnlock() out, err = f.NewCompatible() if err != nil { return nil, err } for i, bitword := range f2.bits { out.bits[i] = f.bits[i] | bitword } // Also update the counters out.n = f.n + f2.n return out, nil } bloomfilter-2.0.3/v2/bloomfilter_test.go000066400000000000000000000175301376764757200203230ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 import ( "fmt" "math/rand" "testing" ) // a read-only type that conforms to hash.Hash64, but only Sum64() works. // It is set by writing the underlying value. type hashableUint64 uint64 func (h hashableUint64) Write([]byte) (int, error) { panic("Unimplemented") } func (h hashableUint64) Sum([]byte) []byte { panic("Unimplemented") } func (h hashableUint64) Reset() { panic("Unimplemented") } func (h hashableUint64) BlockSize() int { panic("Unimplemented") } func (h hashableUint64) Size() int { panic("Unimplemented") } func (h hashableUint64) Sum64() uint64 { return uint64(h) } func hashableUint64Values() []hashableUint64 { return []hashableUint64{ 0, 7, 0x0c0ffee0, 0xdeadbeef, 0xffffffff, } } func hashableUint64NotValues() []hashableUint64 { return []hashableUint64{ 1, 5, 42, 0xa5a5a5a5, 0xfffffffe, } } func Test0(t *testing.T) { bf, _ := New(10000, 5) t.Log("Filled ratio before adds :", bf.PreciseFilledRatio()) for _, x := range hashableUint64Values() { bf.Add(x) } t.Log("Filled ratio after adds :", bf.PreciseFilledRatio()) // these may or may not be true for _, y := range hashableUint64Values() { if bf.Contains(y) { t.Log("value in set querties: may contain ", y) } else { t.Fatal("value in set queries: definitely does not contain ", y, ", but it should") } } // these must all be false for _, z := range hashableUint64NotValues() { if bf.Contains(z) { t.Log("value not in set queries: may or may not contain ", z) } else { t.Log("value not in set queries: definitely does not contain ", z, " which is correct") } } } func TestUnion(t *testing.T) { f1, _ := New(8*500, 4) tmp, _ := New(8*500, 4) if _, err := tmp.Union(f1); err == nil { t.Errorf("Incompatible, should error") } f2, err := f1.NewCompatible() if err != nil { t.Fatal(err) } rand.Seed(1337) // Add some content var tests = make([]hashableUint64, 200) for i := 0; i < len(tests); i++ { tests[i] = hashableUint64(rand.Uint64()) if i&1 == 0 { f1.Add(tests[i]) } else { f2.Add(tests[i]) } } unionF, err := f2.Union(f1) if err != nil { t.Fatal(err) } copyF, err := unionF.Copy() if err != nil { t.Fatal(err) } for i, v := range tests { if !unionF.Contains(v) { t.Errorf("missing item %d", i) } if !copyF.Contains(v) { t.Errorf("missing item %d", i) } if i&1 == 0 { if !f1.Contains(v) { t.Errorf("missing item %d", i) } if f2.Contains(v) { t.Errorf("f2 has item it shouldn't have") } } else { if !f2.Contains(v) { t.Errorf("missing item %d", i) } if f1.Contains(v) { t.Errorf("f1 has item it shouldn't have") } } } // And test merging f1 into f2 if err := f2.UnionInPlace(f1); err != nil { t.Fatal(err) } for i, v := range tests { if !f2.Contains(v) { t.Errorf("missing item %d", i) } if i&1 == 0 { if !f1.Contains(v) { t.Errorf("missing item %d", i) } } else { if f1.Contains(v) { t.Errorf("f1 has item it shouldn't have") } } } } func TestFPRate(t *testing.T) { f, _ := New(8*32, 4) f.n = 101 // "insert" 101 items // yes we could add some more tests here... have, want := f.FalsePosititveProbability(), 0.402507 if int(1000*have) != int(1000*want) { t.Errorf("have %08f, want %f", have, want) } } func BenchmarkAddX10kX5(b *testing.B) { bf, _ := New(10000, 5) b.Run("add-10kx5", func(b *testing.B) { b.ReportAllocs() for i := 0; i < b.N; i++ { bf.Add(hashableUint64(rand.Uint32())) } }) b.Run("add-10kx5-hash", func(b *testing.B) { b.ReportAllocs() for i := 0; i < b.N; i++ { bf.AddHash(uint64(rand.Uint32())) } }) } func TestAddX10kX5(t *testing.T) { b1, _ := New(10000, 5) b2, _ := b1.NewCompatible() verify := func() { for i := 0; i < len(b1.bits); i++ { if b1.bits[i] != b2.bits[i] { t.Fatalf("error at bit %d!", i) } } } for i := 0; i < 1000000; i++ { v := hashableUint64(rand.Uint32()) b1.Add(v) b2.AddHash(v.Sum64()) verify() if !b2.Contains(v) { t.Fatal("contain error") } } } func BenchmarkContains1kX10kX5(b *testing.B) { bf, _ := New(10000, 5) for i := 0; i < 1000; i++ { bf.Add(hashableUint64(rand.Uint32())) } b.Run("contains", func(b *testing.B) { for i := 0; i < b.N; i++ { bf.Contains(hashableUint64(rand.Uint32())) } }) b.Run("containsHash", func(b *testing.B) { for i := 0; i < b.N; i++ { bf.ContainsHash(uint64(rand.Uint32())) } }) } func BenchmarkContains100kX10BX20(b *testing.B) { rand.Seed(1337) b.StopTimer() bf, _ := New(10*1000*1000*1000, 20) for i := 0; i < 100*1000; i++ { bf.Add(hashableUint64(rand.Uint32())) } b.Run("contains", func(b *testing.B) { for i := 0; i < b.N; i++ { bf.Contains(hashableUint64(rand.Uint32())) } }) b.Run("containshash", func(b *testing.B) { for i := 0; i < b.N; i++ { bf.ContainsHash(uint64(rand.Uint32())) } }) } func TestContains(t *testing.T) { rand.Seed(1337) bf, _ := New(10*1000*1000, 20) for i := 0; i < 100*10000; i++ { x := hashableUint64(rand.Uint32()) bf.Add(x) if !bf.Contains(x) { t.Fatalf("Did not contain newly added elem: %d", x.Sum64()) } } } //BenchmarkUnionInPlace/union-8-6 15270 77848 ns/op func BenchmarkUnionInPlace(b *testing.B) { var filters []*Filter b1, _ := New(813129, 6) for i := 0; i < 2000; i++ { b1.Add(hashableUint64(rand.Uint32())) } filters = append(filters, b1) for i := 0; i < 7; i++ { b, _ := b1.NewCompatible() filters = append(filters, b) } b.ResetTimer() b.Run("union-8", func(b *testing.B) { for i := 0; i < b.N; i++ { for _, bx := range filters { _ = b1.UnionInPlace(bx) } } }) } func BenchmarkContains94percentMisses(b *testing.B) { // This test should produce about // 5.4K hits and 94k misses rand.Seed(1337) b.StopTimer() bf, _ := New(10*1000*1000, 20) for i := 0; i < 100*1000; i++ { bf.Add(hashableUint64(rand.Uint32())) } b.Run("contains", func(b *testing.B) { for i := 0; i < b.N; i++ { bf.Contains(hashableUint64(rand.Uint32())) } }) b.Run("containsHash", func(b *testing.B) { for i := 0; i < b.N; i++ { bf.ContainsHash(uint64(rand.Uint32())) } }) } // This test is quite long-running, thus disabled func TestHitrate(t *testing.T) { t.Skip("Long-running test, use only for sanity-checking") /** After changes: Fill ratio: 9.303936 % Theoretical hitrate : 0.007493 % Hit rate (100K random tests): 0.009000 % (9 out of 100000) Hit rate (100K random tests): 0.009000 % (9 out of 100000) Zero-filter Hit rate (100K random tests): 9.373000 % (9373 out of 100000) 1-filter Hit rate: 9.474021 % (888 out of 9373) Original changes: Fill ratio: 9.303647 % Theoretical hitrate : 0.007492 % Hit rate (100K random tests): 2.658000 % (2658 out of 100000) Zero-filter Hit rate (100K random tests): 9.456000 % (9456 out of 100000) 1-filter Hit rate: 53.489848 % (5058 out of 9456) */ // 512 MB bloom filter f, _ := New(512*1024*1024*8, 4) // Fill it with 100M items for i := 0; i < 100*1024*1024; i++ { val := rand.Uint64() f.AddHash(val) if !f.ContainsHash(val) { t.Fatalf("Missing value (just inserted) %d", val) } } // Test individual matches numTests := 100000 hits := 0 for i := 0; i < numTests; i++ { h := rand.Uint64() if f.ContainsHash(h) { hits++ } } fmt.Printf("Error rate: %f %%\n", 100*f.FalsePosititveProbability()) // With four keys, we should obtain fillrate^4 chance of false positive fp := f.PreciseFilledRatio() fmt.Printf("Fill ratio: %02f %%\n", 100*fp) fmt.Printf("Theoretical hitrate : %02f %%\n", 100*fp*fp*fp*fp) fmt.Printf("Hit rate (100K random tests): %02f %% (%d out of %d) \n", 100*float64(hits)/float64(numTests), hits, numTests) } bloomfilter-2.0.3/v2/conformance.go000066400000000000000000000013721376764757200172350ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 import ( "encoding" "encoding/gob" "encoding/json" "io" ) // compile-time conformance tests var ( _ encoding.BinaryMarshaler = (*Filter)(nil) _ encoding.BinaryUnmarshaler = (*Filter)(nil) _ io.ReaderFrom = (*Filter)(nil) _ io.WriterTo = (*Filter)(nil) _ gob.GobDecoder = (*Filter)(nil) _ gob.GobEncoder = (*Filter)(nil) _ json.Marshaler = (*Filter)(nil) _ json.Unmarshaler = (*Filter)(nil) ) bloomfilter-2.0.3/v2/fileio.go000066400000000000000000000053471376764757200162200ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 import ( "compress/gzip" _ "encoding/gob" // make sure gob is available "encoding/json" "errors" "io" "os" ) // ReadFrom r and overwrite f with new Bloom filter data func (f *Filter) ReadFrom(r io.Reader) (n int64, err error) { f2, n, err := ReadFrom(r) if err != nil { return -1, err } f.lock.Lock() defer f.lock.Unlock() f.m = f2.m f.n = f2.n f.bits = f2.bits f.keys = f2.keys return n, nil } // ReadFrom Reader r into a lossless-compressed Bloom filter f func ReadFrom(r io.Reader) (f *Filter, n int64, err error) { f = new(Filter) rawR, err := gzip.NewReader(r) if err != nil { return nil, -1, err } defer rawR.Close() n, err = f.UnmarshalFromReader(rawR) if err != nil { return nil, -1, err } return f, n, nil } // ReadFile from filename into a lossless-compressed Bloom Filter f // Suggested file extension: .bf.gz func ReadFile(filename string) (f *Filter, n int64, err error) { r, err := os.Open(filename) if err != nil { return nil, -1, err } defer r.Close() return ReadFrom(r) } // WriteTo a Writer w from lossless-compressed Bloom Filter f func (f *Filter) WriteTo(w io.Writer) (n int64, err error) { f.lock.RLock() defer f.lock.RUnlock() rawW := gzip.NewWriter(w) defer rawW.Close() intN, _, err := f.MarshallToWriter(rawW) //intN, _, err := f.MarshallToWriter(w) n = int64(intN) return n, err } // WriteFile filename from a a lossless-compressed Bloom Filter f // Suggested file extension: .bf.gz func (f *Filter) WriteFile(filename string) (n int64, err error) { w, err := os.Create(filename) if err != nil { return -1, err } defer w.Close() return f.WriteTo(w) } type jsonType struct { Version string `json:"version"` Bits []uint64 `json:"bits"` Keys []uint64 `json:"keys"` M uint64 `json:"m"` N uint64 `json:"n"` } func (f *Filter) MarshalJSON() ([]byte, error) { return json.Marshal(&jsonType{ string(version), f.bits, f.keys, f.m, f.n, }) } func (f *Filter) UnmarshalJSON(data []byte) error { var j jsonType if err := json.Unmarshal(data, &j); err != nil { return err } if j.Version != string(version) { return errors.New("incompatible version") } f.bits = j.Bits f.keys = j.Keys f.n = j.N f.m = j.M return nil } // GobDecode conforms to interface gob.GobDecoder func (f *Filter) GobDecode(data []byte) error { return f.UnmarshalBinary(data) } // GobEncode conforms to interface gob.GobEncoder func (f *Filter) GobEncode() ([]byte, error) { return f.MarshalBinary() } bloomfilter-2.0.3/v2/fileio_test.go000066400000000000000000000105401376764757200172460ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // Copyright © 2018, 2020 Martin Holst Swende // MIT license // package v2 import ( "bytes" "crypto/sha512" "encoding/json" "fmt" "math/rand" "os" "path/filepath" "runtime" "testing" ) type devnull struct{} func (d devnull) Write(p []byte) (n int, err error) { return len(p), nil } func TestWriteRead(t *testing.T) { // minimal filter f, _ := New(8*1024*100, 5) // Add some content var tests = make([]hashableUint64, 20) for i := 0; i < 20; i++ { tests[i] = hashableUint64(rand.Uint64()) f.Add(tests[i]) } verify := func(t *testing.T, f *Filter) { for i, v := range tests { if !f.Contains(v) { t.Errorf("missing item %d", i) } } } t.Run("binary", func(t *testing.T) { var b bytes.Buffer _, err := f.WriteTo(&b) if err != nil { t.Fatal(err) } cpy := append([]byte{}, b.Bytes()...) var f2 *Filter if f2, _, err = ReadFrom(&b); err != nil { t.Fatal(err) } verify(t, f2) // test overwrite f3, _ := New(8*5, 3) if _, err = f3.ReadFrom(bytes.NewReader(cpy)); err != nil { t.Fatal(err) } verify(t, f3) }) t.Run("gob", func(t *testing.T) { data, err := f.GobEncode() if err != nil { t.Fatal(err) } var f2 Filter err = f2.GobDecode(data) if err != nil { t.Fatal(err) } verify(t, &f2) }) t.Run("json", func(t *testing.T) { data, err := json.Marshal(f) if err != nil { t.Fatal(err) } var f2 Filter if err = json.Unmarshal(data, &f2); err != nil { t.Fatal(err) } verify(t, &f2) }) t.Run("file", func(t *testing.T) { fName := filepath.Join(os.TempDir(), "temp.deleteme.gz") if _, err := f.WriteFile(fName); err != nil { t.Fatal(err) } defer os.Remove(fName) if f2, _, err := ReadFile(fName); err != nil { t.Fatal(err) } else { verify(t, f2) } }) } func TestCorruption(t *testing.T) { // minimal filter f, _ := New(8*32, 5) // Add some content var tests = make([]hashableUint64, 20) for i := 0; i < 20; i++ { tests[i] = hashableUint64(rand.Uint64()) f.Add(tests[i]) } t.Run("binary", func(t *testing.T) { var b bytes.Buffer _, err := f.WriteTo(&b) if err != nil { t.Fatal(err) } buf := b.Bytes() buf[len(buf)/2] ^= 1 if _, _, err := ReadFrom(&b); err == nil { t.Errorf("expected error") } }) t.Run("gob", func(t *testing.T) { data, err := f.GobEncode() if err != nil { t.Fatal(err) } // Flip a bit data[len(data)/2] ^= 1 var f2 Filter err = f2.GobDecode(data) if err == nil { t.Errorf("expected error") } }) } func bToMb(b uint64) uint64 { return b / 1024 / 1024 } func PrintMemUsage() { var m runtime.MemStats runtime.ReadMemStats(&m) // For info on each, see: https://golang.org/pkg/runtime/#MemStats fmt.Printf("Alloc = %v MiB", bToMb(m.Alloc)) fmt.Printf("\tTotalAlloc = %v MiB", bToMb(m.TotalAlloc)) fmt.Printf("\tSys = %v MiB", bToMb(m.Sys)) fmt.Printf("\tNumGC = %v\n", m.NumGC) } func TestWrite(t *testing.T) { // 1Mb f, _ := New(4*8*1024*1024, 1) fmt.Printf("Allocated 1mb filter\n") PrintMemUsage() _, _ = f.WriteTo(devnull{}) fmt.Printf("Wrote filter to devnull\n") PrintMemUsage() } // fillRandom fills the filter with N random values, where N is roughly half // the size of the number of uint64's in the filter func fillRandom(f *Filter) { num := len(f.bits) * 4 for i := 0; i < num; i++ { f.AddHash(uint64(rand.Int63())) } } // TestMarshaller tests that it writes outputs correctly. func TestMarshaller(t *testing.T) { h1 := sha512.New384() h2 := sha512.New384() f, _ := New(1*8*1024*1024, 1) fillRandom(f) // Marshall using writer _, _, _ = f.MarshallToWriter(h1) // Marshall as a blob data, _ := f.MarshalBinary() _, _ = h2.Write(data) if have, want := h1.Sum(nil), h2.Sum(nil); !bytes.Equal(have, want) { t.Errorf("Marshalling error, have %x want %x", have, want) } } func BenchmarkWrite1Mb(b *testing.B) { // 1Mb f, _ := New(1*8*1024*1024, 1) f.Add(hashableUint64(0)) f.Add(hashableUint64(1)) f.Add(hashableUint64(1 << 3)) f.Add(hashableUint64(1 << 40)) f.Add(hashableUint64(1 << 23)) f.Add(hashableUint64(1 << 16)) f.Add(hashableUint64(1 << 28)) b.ReportAllocs() for i := 0; i < b.N; i++ { _, _ = f.WriteTo(devnull{}) } } bloomfilter-2.0.3/v2/go.mod000066400000000000000000000000621376764757200155150ustar00rootroot00000000000000module github.com/holiman/bloomfilter/v2 go 1.15 bloomfilter-2.0.3/v2/iscompatible.go000066400000000000000000000014671376764757200174230ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 // returns 0 if equal, does not compare len(b0) with len(b1) func noBranchCompareUint64s(b0, b1 []uint64) uint64 { r := uint64(0) for i, b0i := range b0 { r |= b0i ^ b1[i] } return r } // IsCompatible is true if f and f2 can be Union()ed together func (f *Filter) IsCompatible(f2 *Filter) bool { f.lock.RLock() defer f.lock.RUnlock() f2.lock.RLock() defer f2.lock.RUnlock() // 0 is true, non-0 is false compat := f.M() ^ f2.M() compat |= f.K() ^ f2.K() compat |= noBranchCompareUint64s(f.keys, f2.keys) return compat == 0 } bloomfilter-2.0.3/v2/new.go000066400000000000000000000057301376764757200155360ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 import ( crand "crypto/rand" "encoding/binary" "fmt" "math" ) const ( MMin = 2 // MMin is the minimum Bloom filter bits count KMin = 1 // KMin is the minimum number of keys Uint64Bytes = 8 // Uint64Bytes is the number of bytes in type uint64 ) // OptimalK calculates the optimal k value for creating a new Bloom filter // maxn is the maximum anticipated number of elements func OptimalK(m, maxN uint64) uint64 { return uint64(math.Ceil(float64(m) * math.Ln2 / float64(maxN))) } // OptimalM calculates the optimal m value for creating a new Bloom filter // p is the desired false positive probability // optimal m = ceiling( - n * ln(p) / ln(2)**2 ) func OptimalM(maxN uint64, p float64) uint64 { return uint64(math.Ceil(-float64(maxN) * math.Log(p) / (math.Ln2 * math.Ln2))) } // New Filter with CSPRNG keys // // m is the size of the Bloom filter, in bits, >= 2 // // k is the number of random keys, >= 1 func New(m, k uint64) (*Filter, error) { return NewWithKeys(m, newRandKeys(m, k)) } func newRandKeys(m uint64, k uint64) []uint64 { keys := make([]uint64, k) if err := binary.Read(crand.Reader, binary.LittleEndian, keys); err != nil { panic(fmt.Sprintf("Cannot read %d bytes from CSRPNG crypto/rand.Read (err=%v)", Uint64Bytes, err)) } return keys } // NewCompatible Filter compatible with f func (f *Filter) NewCompatible() (*Filter, error) { return NewWithKeys(f.m, f.keys) } // NewOptimal Bloom filter with random CSPRNG keys func NewOptimal(maxN uint64, p float64) (*Filter, error) { m := OptimalM(maxN, p) k := OptimalK(m, maxN) return New(m, k) } // uniqueKeys is true if all keys are unique func uniqueKeys(keys []uint64) bool { for j := 0; j < len(keys)-1; j++ { for i := j + 1; i < len(keys); i++ { if keys[i] == keys[j] { return false } } } return true } // NewWithKeys creates a new Filter from user-supplied origKeys func NewWithKeys(m uint64, origKeys []uint64) (f *Filter, err error) { var ( bits []uint64 keys []uint64 ) if bits, err = newBits(m); err != nil { return nil, err } if keys, err = newKeysCopy(origKeys); err != nil { return nil, err } return &Filter{ m: m, n: 0, bits: bits, keys: keys, }, nil } func newBits(m uint64) ([]uint64, error) { if m < MMin { return nil, fmt.Errorf("number of bits in the filter must be >= %d (was %d)", MMin, m) } return make([]uint64, (m+63)/64), nil } func newKeysCopy(origKeys []uint64) (keys []uint64, err error) { if len(origKeys) < KMin { return nil, fmt.Errorf("keys must have length %d or greater (was %d)", KMin, len(origKeys)) } if !uniqueKeys(origKeys) { return nil, fmt.Errorf("Bloom filter keys must be unique") } keys = append(keys, origKeys...) return keys, err } bloomfilter-2.0.3/v2/optimal_test.go000066400000000000000000000012301376764757200174400ustar00rootroot00000000000000package v2 import ( "testing" ) func TestOptimal(t *testing.T) { tests := []struct { n uint64 p float64 k, m uint64 }{ { n: 1000, p: 0.01 / 100, k: 14, m: 19171, }, { n: 10000, p: 0.01 / 100, k: 14, m: 191702, }, { n: 10000, p: 0.01 / 100, k: 14, m: 191702, }, { n: 1000, p: 0.001 / 100, k: 17, m: 23963, }, } for _, test := range tests { m := OptimalM(test.n, test.p) k := OptimalK(m, test.n) if k != test.k || m != test.m { t.Errorf( "n=%d p=%f: expected (m=%d, k=%d), got (m=%d, k=%d)", test.n, test.p, test.m, test.k, m, k, ) } } } bloomfilter-2.0.3/v2/statistics.go000066400000000000000000000021641376764757200171350ustar00rootroot00000000000000// Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package v2 import ( "math" "math/bits" ) // CountBitsUint64s count 1's in b func CountBitsUint64s(b []uint64) int { c := 0 for _, x := range b { c += bits.OnesCount64(x) } return c } // PreciseFilledRatio is an exhaustive count # of 1's func (f *Filter) PreciseFilledRatio() float64 { f.lock.RLock() defer f.lock.RUnlock() return float64(CountBitsUint64s(f.bits)) / float64(f.M()) } // N is how many elements have been inserted // (actually, how many Add()s have been performed?) func (f *Filter) N() uint64 { f.lock.RLock() defer f.lock.RUnlock() return f.n } // FalsePosititveProbability is the upper-bound probability of false positives // (1 - exp(-k*(n+0.5)/(m-1))) ** k func (f *Filter) FalsePosititveProbability() float64 { k := float64(f.K()) n := float64(f.N()) m := float64(f.M()) return math.Pow(1.0-math.Exp((-k)*(n+0.5)/(m-1)), k) }