pax_global_header00006660000000000000000000000064151535562700014523gustar00rootroot0000000000000052 comment=40dab66a8f9114ec575eb4eb7e340a5fc5267067 golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/000077500000000000000000000000001515355627000217265ustar00rootroot00000000000000golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/.travis.yml000066400000000000000000000000351515355627000240350ustar00rootroot00000000000000language: go go: - master golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/LICENSE000066400000000000000000000020631515355627000227340ustar00rootroot00000000000000MIT License Copyright (c) 2019 Øyvind Ingvaldsen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/README.md000066400000000000000000000065521515355627000232150ustar00rootroot00000000000000# Aho-Corasick [![Build Status](https://travis-ci.com/BobuSumisu/aho-corasick.svg?token=eGRFn5xdQ7p9yby3GVvc&branch=master)](https://travis-ci.com/BobuSumisu/aho-corasick) ![Go Version](https://img.shields.io/github/go-mod/go-version/BobuSumisu/aho-corasick) ![Latest Tag](https://img.shields.io/github/v/tag/BobuSumisu/aho-corasick) Implementation of the Aho-Corasick string-search algorithm in Go. Licensed under MIT License. ## Details This implementation does not use a [Double-Array Trie](https://linux.thai.net/~thep/datrie/datrie.html) as in my [implementation](https://github.com/BobuSumisu/go-ahocorasick) from a couple of years back. This reduces the build time drastically, but at the cost of higher memory consumption. The search time is still fast, and comparable to other Go implementations I have found on GitHub that claim to be fast (see [performance](#Performance)). ## Documentation Can be found at [godoc.org](https://godoc.org/github.com/BobuSumisu/aho-corasick). ## Example Usage Use a `TrieBuilder` to build a `Trie`: ```go trie := NewTrieBuilder(). AddStrings([]string{"or", "amet"}). Build() ``` Then go and match something interesting: ```go matches := trie.MatchString("Lorem ipsum dolor sit amet, consectetur adipiscing elit.") fmt.Printf("Got %d matches.\n", len(matches)) // => Got 3 matches. ``` What did we match? ```go for _, match := range matches { fmt.Printf("Matched pattern %d %q at position %d.\n", match.Pattern(), match.Match(), match.Pos()) } // => Matched pattern 0 "or" at position 1. // => Matched pattern 0 "or" at position 15. // => Matched pattern 1 "amet" at position 22. ``` ## Building You can easily load patterns from file: ```go builder := NewTrieBuilder() builder.LoadPatterns("patterns.txt") builder.LoadStrings("strings.txt") ``` Both functions expect a text file with one pattern per line. 
`LoadPatterns` expects each pattern to be in hexadecimal form. ## Storing Use `Encode` to store a `Trie` in gzip compressed binary format: ```go f, err := os.Create("trie.gz") err = Encode(f, trie) ``` And `Decode` to load it from binary format: ```go f, err := os.Open("trie.gz") trie, err := Decode(f) ``` ## Performance Some simple benchmarking on my machine (Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, 32 GiB RAM). Build and search times grow quite linearly with regard to the number of patterns and the input text length. ### Building BenchmarkTrieBuild/100-12 10000 0.1460 ms/op BenchmarkTrieBuild/1000-12 1000 2.1643 ms/op BenchmarkTrieBuild/10000-12 100 14.3305 ms/op BenchmarkTrieBuild/100000-12 10 131.2442 ms/op ### Searching BenchmarkMatchIbsen/100-12 2000000 0.0006 ms/op BenchmarkMatchIbsen/1000-12 300000 0.0042 ms/op BenchmarkMatchIbsen/10000-12 30000 0.0436 ms/op BenchmarkMatchIbsen/100000-12 3000 0.4310 ms/op ### Compared to Other Implementations See [aho-corasick-benchmark](https://github.com/Bobusumisu/aho-corasick-benchmark). ### Memory Usage As mentioned, the memory consumption will be quite high compared to a double-array trie implementation. Especially during the build phase (which currently contains a lot of object allocations). golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/builder.go000066400000000000000000000101171515355627000237030ustar00rootroot00000000000000package ahocorasick import ( "bufio" "encoding/hex" "os" "strings" ) type state struct { id int64 value byte parent *state trans map[byte]*state dict int64 failLink *state dictLink *state pattern int64 } // TrieBuilder is used to build Tries. type TrieBuilder struct { states []*state root *state numPatterns int64 } // NewTrieBuilder creates and initializes a new TrieBuilder. 
func NewTrieBuilder() *TrieBuilder { tb := &TrieBuilder{ states: make([]*state, 0), root: nil, numPatterns: 0, } tb.addState(0, nil) tb.addState(0, nil) tb.root = tb.states[1] return tb } func (tb *TrieBuilder) addState(value byte, parent *state) *state { s := &state{ id: int64(len(tb.states)), value: value, parent: parent, trans: make(map[byte]*state), dict: 0, failLink: nil, dictLink: nil, pattern: 0, } tb.states = append(tb.states, s) return s } // AddPattern adds a byte pattern to the Trie under construction. func (tb *TrieBuilder) AddPattern(pattern []byte) *TrieBuilder { s := tb.root var t *state var ok bool for _, c := range pattern { if t, ok = s.trans[c]; !ok { t = tb.addState(c, s) s.trans[c] = t } s = t } s.dict = int64(len(pattern)) s.pattern = tb.numPatterns tb.numPatterns++ return tb } // AddPatterns adds multiple byte patterns to the Trie. func (tb *TrieBuilder) AddPatterns(patterns [][]byte) *TrieBuilder { for _, pattern := range patterns { tb.AddPattern(pattern) } return tb } // AddString adds a string pattern to the Trie under construction. func (tb *TrieBuilder) AddString(pattern string) *TrieBuilder { return tb.AddPattern([]byte(pattern)) } // AddStrings add multiple strings to the Trie. func (tb *TrieBuilder) AddStrings(patterns []string) *TrieBuilder { for _, pattern := range patterns { tb.AddString(pattern) } return tb } // LoadPatterns loads byte patterns from a file. Expects one pattern per line in hexadecimal form. func (tb *TrieBuilder) LoadPatterns(path string) error { f, err := os.Open(path) if err != nil { return err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { str := strings.TrimSpace(s.Text()) if len(str) != 0 { pattern, err := hex.DecodeString(str) if err != nil { return err } tb.AddPattern(pattern) } } return s.Err() } // LoadStrings loads string patterns from a file. Expects one pattern per line. 
func (tb *TrieBuilder) LoadStrings(path string) error { f, err := os.Open(path) if err != nil { return err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { str := strings.TrimSpace(s.Text()) if len(str) != 0 { tb.AddString(str) } } return s.Err() } // Build constructs the Trie. func (tb *TrieBuilder) Build() *Trie { tb.computeFailLinks(tb.root) tb.computeDictLinks(tb.root) numStates := len(tb.states) dict := make([]int64, numStates) trans := make([][256]int64, numStates) failLink := make([]int64, numStates) dictLink := make([]int64, numStates) pattern := make([]int64, numStates) for i, s := range tb.states { dict[i] = s.dict pattern[i] = s.pattern for c, t := range s.trans { trans[i][c] = t.id } if s.failLink != nil { failLink[i] = s.failLink.id } if s.dictLink != nil { dictLink[i] = s.dictLink.id } } return &Trie{dict, trans, failLink, dictLink, pattern} } func (tb *TrieBuilder) computeFailLinks(s *state) { if s.failLink != nil { return } if s == tb.root || s.parent == tb.root { s.failLink = tb.root } else { var ok bool for t := s.parent.failLink; t != tb.root; t = t.failLink { if t.failLink == nil { tb.computeFailLinks(t) } if s.failLink, ok = t.trans[s.value]; ok { break } } if s.failLink == nil { if s.failLink, ok = tb.root.trans[s.value]; !ok { s.failLink = tb.root } } } for _, t := range s.trans { tb.computeFailLinks(t) } } func (tb *TrieBuilder) computeDictLinks(s *state) { if s != tb.root { for t := s.failLink; t != tb.root; t = t.failLink { if t.dict != 0 { s.dictLink = t break } } } for _, t := range s.trans { tb.computeDictLinks(t) } } golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/builder_test.go000066400000000000000000000020141515355627000247370ustar00rootroot00000000000000package ahocorasick import ( "fmt" "io/ioutil" "testing" ) func TestLoadStrings(t *testing.T) { tb := NewTrieBuilder() if err := tb.LoadStrings("doesnt-exists.txt"); err == nil { t.Errorf("should fail") } if err := tb.LoadStrings("./test_data/strings.txt"); err != nil { 
t.Error(err) } tr := tb.Build() ibsen, err := ioutil.ReadFile("./test_data/Ibsen.txt") if err != nil { t.Error(err) } expected := 697 ms := tr.Match(ibsen) if len(ms) != expected { for _, m := range ms { fmt.Println(m) } t.Errorf("expected %d matches, got %d\n", expected, len(ms)) } } func TestLoadPatterns(t *testing.T) { tb := NewTrieBuilder() if err := tb.LoadPatterns("./test_data/patterns.txt"); err != nil { t.Error(err) } tr := tb.Build() ibsen, err := ioutil.ReadFile("./test_data/Ibsen.txt") if err != nil { t.Error(err) } expected := 697 ms := tr.Match(ibsen) if len(ms) != expected { for _, m := range ms { fmt.Println(m) } t.Errorf("expected %d matches, got %d\n", expected, len(ms)) } } golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/go.mod000066400000000000000000000000631515355627000230330ustar00rootroot00000000000000module github.com/BobuSumisu/aho-corasick go 1.14 golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/go.sum000066400000000000000000000000011515355627000230500ustar00rootroot00000000000000 golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/match.go000066400000000000000000000021451515355627000233530ustar00rootroot00000000000000package ahocorasick import ( "bytes" "fmt" ) // Match represents a matched pattern in the input. type Match struct { pos int64 pattern int64 match []byte } func newMatch(pos, pattern int64, match []byte) *Match { return &Match{pos, pattern, match} } func newMatchString(pos, pattern int64, match string) *Match { return &Match{pos: pos, pattern: pattern, match: []byte(match)} } func (m *Match) String() string { return fmt.Sprintf("{%d %d %q}", m.pos, m.pattern, m.match) } // Pos returns the byte position of the match. func (m *Match) Pos() int64 { return m.pos } // Pattern returns the pattern id of the match. func (m *Match) Pattern() int64 { return m.pattern } // Match returns the pattern matched. func (m *Match) Match() []byte { return m.match } // MatchString returns the pattern matched as a string. 
func (m *Match) MatchString() string { return string(m.match) } // MatchEqual check whether two matches are equal (i.e. at same position, pattern and same pattern). func MatchEqual(a, b *Match) bool { return a.pos == b.pos && a.pattern == b.pattern && bytes.Equal(a.match, b.match) } golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/stream.go000066400000000000000000000057771515355627000235700ustar00rootroot00000000000000package ahocorasick import ( "compress/gzip" "encoding/binary" "io" ) // Encode writes a Trie to w in gzip compressed binary format. func Encode(w io.Writer, trie *Trie) error { enc := newEncoder(w) return enc.encode(trie) } // Decode reads a Trie in gzip compressed binary format from r. func Decode(r io.Reader) (*Trie, error) { dec := newDecoder(r) return dec.decode() } type encoder struct { w io.Writer } func newEncoder(w io.Writer) *encoder { return &encoder{ w, } } func (enc *encoder) encode(trie *Trie) error { w := gzip.NewWriter(enc.w) defer w.Close() if err := binary.Write(w, binary.LittleEndian, uint64(len(trie.dict))); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, uint64(len(trie.trans))); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, uint64(len(trie.failLink))); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, uint64(len(trie.dictLink))); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, uint64(len(trie.pattern))); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, trie.dict); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, trie.trans); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, trie.failLink); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, trie.dictLink); err != nil { return err } if err := binary.Write(w, binary.LittleEndian, trie.pattern); err != nil { return err } return nil } type decoder struct { r io.Reader } func newDecoder(r 
io.Reader) *decoder { return &decoder{ r, } } func (dec *decoder) decode() (*Trie, error) { r, err := gzip.NewReader(dec.r) if err != nil { return nil, err } var dictLen, transLen, dictLinkLen, failLinkLen, patternLen uint64 if err := binary.Read(r, binary.LittleEndian, &dictLen); err != nil { return nil, err } if err := binary.Read(r, binary.LittleEndian, &transLen); err != nil { return nil, err } if err := binary.Read(r, binary.LittleEndian, &dictLinkLen); err != nil { return nil, err } if err := binary.Read(r, binary.LittleEndian, &failLinkLen); err != nil { return nil, err } if err := binary.Read(r, binary.LittleEndian, &patternLen); err != nil { return nil, err } dict := make([]int64, dictLen) if err := binary.Read(r, binary.LittleEndian, dict); err != nil { return nil, err } trans := make([][256]int64, transLen) if err := binary.Read(r, binary.LittleEndian, trans); err != nil { return nil, err } failLink := make([]int64, failLinkLen) if err := binary.Read(r, binary.LittleEndian, failLink); err != nil { return nil, err } dictLink := make([]int64, dictLinkLen) if err := binary.Read(r, binary.LittleEndian, dictLink); err != nil { return nil, err } pattern := make([]int64, patternLen) if err := binary.Read(r, binary.LittleEndian, pattern); err != nil { return nil, err } return &Trie{dict, trans, failLink, dictLink, pattern}, nil } golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/stream_test.go000066400000000000000000000030611515355627000246070ustar00rootroot00000000000000package ahocorasick import ( "bytes" "fmt" "os" "testing" ) func testTrie(trie *Trie) error { matches := trie.MatchString("Lorem ipsum dolor sit amet, consectetur adipiscing elit.") expected := []*Match{ newMatchString(1, 0, "or"), newMatchString(15, 0, "or"), newMatchString(22, 1, "amet"), } if len(expected) != len(matches) { return fmt.Errorf("expected %d matches, got %d\n", len(expected), len(matches)) } for i := range matches { if !MatchEqual(expected[i], matches[i]) { return 
fmt.Errorf("expected %v, got %v\n", expected[i], matches[i]) } } return nil } func TestEncodingAndDecoding(t *testing.T) { trie := NewTrieBuilder().AddStrings([]string{"or", "amet"}).Build() if err := testTrie(trie); err != nil { t.Error(err) } var buf bytes.Buffer if err := Encode(&buf, trie); err != nil { t.Error(err) } decodedTrie, err := Decode(&buf) if err != nil { t.Error(err) } if err := testTrie(decodedTrie); err != nil { t.Error(err) } } func TestReadAndWriteTrie(t *testing.T) { patterns, err := readPatterns("test_data/NSF-ordlisten.cleaned.uniq.txt") if err != nil { t.Fatal(err) } trie := NewTrieBuilder().AddStrings(patterns[:10000]).Build() f, err := os.Create("test.trie") if err != nil { t.Fatal(err) } defer os.Remove("test.trie") if err := Encode(f, trie); err != nil { t.Fatal(err) } f.Seek(0, 0) decodedTrie, err := Decode(f) if err != nil { t.Fatal(err) } matches := decodedTrie.MatchString("abasien") if len(matches) != 3 { t.Errorf("expected 3 matches, got %d", len(matches)) } } golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/test_data/000077500000000000000000000000001515355627000236765ustar00rootroot00000000000000golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/test_data/patterns.txt000066400000000000000000000002511515355627000262750ustar00rootroot00000000000000486564766967 456b64616c 486a616c6d6172 47696e61 48c3a56b6f6e 47726567657273 53c3b8726279 52656c6c696e67 4d6f6c76696b 50657474657273656e 4a656e73656e 42616c6c65 466c6f72 golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/test_data/strings.txt000066400000000000000000000001331515355627000261250ustar00rootroot00000000000000Hedvig Ekdal Hjalmar Gina Håkon Gregers Sørby Relling Molvik Pettersen Jensen Balle Flor golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/trie.go000066400000000000000000000043361515355627000232260ustar00rootroot00000000000000package ahocorasick const ( rootState int64 = 1 nilState int64 = 0 ) // Trie represents a trie of patterns with extra links as per the Aho-Corasick 
algorithm. type Trie struct { dict []int64 trans [][256]int64 failLink []int64 dictLink []int64 pattern []int64 } // Walk calls this function on any match, giving the end position, length of the matched bytes, // and the pattern number. type WalkFn func(end, n, pattern int64) bool // Walk runs the algorithm on a given output, calling the supplied callback function on every // match. The algorithm will terminate if the callback function returns false. func (tr *Trie) Walk(input []byte, fn WalkFn) { s := rootState for i, c := range input { t := tr.trans[s][c] if t == nilState { for u := tr.failLink[s]; u != rootState; u = tr.failLink[u] { if t = tr.trans[u][c]; t != nilState { break } } if t == nilState { if t = tr.trans[rootState][c]; t == nilState { t = rootState } } } s = t if tr.dict[s] != 0 { if !fn(int64(i), tr.dict[s], tr.pattern[s]) { return } } if tr.dictLink[s] != nilState { for u := tr.dictLink[s]; u != nilState; u = tr.dictLink[u] { if !fn(int64(i), tr.dict[u], tr.pattern[u]) { return } } } } } // Match runs the Aho-Corasick string-search algorithm on a byte input. func (tr *Trie) Match(input []byte) []*Match { matches := make([]*Match, 0) tr.Walk(input, func(end, n, pattern int64) bool { pos := end - n + 1 matches = append(matches, newMatch(pos, pattern, input[pos:pos+n])) return true }) return matches } // MatchFirst is the same as Match, but returns after first successful match. func (tr *Trie) MatchFirst(input []byte) *Match { var match *Match tr.Walk(input, func(end, n, pattern int64) bool { pos := end - n + 1 match = &Match{pos: pos, match: input[pos : pos+n]} return false }) return match } // MatchString runs the Aho-Corasick string-search algorithm on a string input. func (tr *Trie) MatchString(input string) []*Match { return tr.Match([]byte(input)) } // MatchFirstString is the same as MatchString, but returns after first successful match. 
func (tr *Trie) MatchFirstString(input string) *Match { return tr.MatchFirst([]byte(input)) } golang-github-bobusumisu-aho-corasick-1.0.3+dfsg/trie_test.go000066400000000000000000000135721515355627000242670ustar00rootroot00000000000000package ahocorasick import ( "bufio" "fmt" "io/ioutil" "os" "strings" "testing" ) func TestReadme(t *testing.T) { trie := NewTrieBuilder().AddStrings([]string{"or", "amet"}).Build() matches := trie.MatchString("Lorem ipsum dolor sit amet, consectetur adipiscing elit.") expected := []*Match{ newMatchString(1, 0, "or"), newMatchString(15, 0, "or"), newMatchString(22, 1, "amet"), } if len(expected) != len(matches) { t.Errorf("expected %d matches, got %d\n", len(expected), len(matches)) } for i := range matches { if !MatchEqual(expected[i], matches[i]) { t.Errorf("expected %v, got %v\n", expected[i], matches[i]) } } } func TestTrie(t *testing.T) { cases := []struct { name string patterns []string input string expected []*Match }{ { "Wikipedia", []string{"a", "ab", "bab", "bc", "bca", "c", "caa"}, "abccab", []*Match{ newMatchString(0, 0, "a"), newMatchString(0, 1, "ab"), newMatchString(1, 3, "bc"), newMatchString(2, 5, "c"), newMatchString(3, 5, "c"), newMatchString(4, 0, "a"), newMatchString(4, 1, "ab"), }, }, { "Prefix", []string{"Aho-Corasick", "Aho-Cora", "Aho", "A"}, "Aho-Corasick", []*Match{ newMatchString(0, 3, "A"), newMatchString(0, 2, "Aho"), newMatchString(0, 1, "Aho-Cora"), newMatchString(0, 0, "Aho-Corasick"), }, }, { "Suffix", []string{"Aho-Corasick", "Corasick", "sick", "k"}, "Aho-Corasick", []*Match{ newMatchString(0, 0, "Aho-Corasick"), newMatchString(4, 1, "Corasick"), newMatchString(8, 2, "sick"), newMatchString(11, 3, "k"), }, }, { "Infix", []string{"Aho-Corasick", "ho-Corasi", "o-Co", "-"}, "Aho-Corasick", []*Match{ newMatchString(3, 3, "-"), newMatchString(2, 2, "o-Co"), newMatchString(1, 1, "ho-Corasi"), newMatchString(0, 0, "Aho-Corasick"), }, }, { "Overlap", []string{"Aho-Co", "ho-Cora", "o-Coras", "-Corasick"}, 
"Aho-Corasick", []*Match{ newMatchString(0, 0, "Aho-Co"), newMatchString(1, 1, "ho-Cora"), newMatchString(2, 2, "o-Coras"), newMatchString(3, 3, "-Corasick"), }, }, { "Adjacent", []string{"Ah", "o-Co", "ras", "ick"}, "Aho-Corasick", []*Match{ newMatchString(0, 0, "Ah"), newMatchString(2, 1, "o-Co"), newMatchString(6, 2, "ras"), newMatchString(9, 3, "ick"), }, }, { "SingleSymbol", []string{"o"}, "Aho-Corasick", []*Match{ newMatchString(2, 0, "o"), newMatchString(5, 0, "o"), }, }, { "NoMatch", []string{"Gazorpazopfield", "Knuth", "O"}, "Aho-Corasick", []*Match{}, }, { "Zeroes", []string{"\x00\x00"}, "\x00\x00Aho\x00\x00-\x00\x00Corasick\x00\x00", []*Match{ newMatchString(0, 0, "\x00\x00"), newMatchString(5, 0, "\x00\x00"), newMatchString(8, 0, "\x00\x00"), newMatchString(18, 0, "\x00\x00"), }, }, { "Alphabetsize", []string{"\xff\xff"}, "\xff\xffAho\xfe\xfe-\xff\xffCorasick\xff\xff\xff", []*Match{ newMatchString(0, 0, "\xff\xff"), newMatchString(8, 0, "\xff\xff"), newMatchString(18, 0, "\xff\xff"), newMatchString(19, 0, "\xff\xff"), }, }, } for _, c := range cases { tr := NewTrieBuilder().AddStrings(c.patterns).Build() matches := tr.MatchString(c.input) if len(matches) != len(c.expected) { t.Errorf("%s: expected %d matches, got %d", c.name, len(c.expected), len(matches)) continue } for i := range matches { if !MatchEqual(matches[i], c.expected[i]) { t.Errorf("%s: expected %v, got %v", c.name, c.expected[i], matches[i]) } } } } func TestMatchFirst(t *testing.T) { ibsen, err := ioutil.ReadFile("./test_data/Ibsen.txt") if err != nil { t.Error(err) } tr := NewTrieBuilder().AddString("Hedvig").Build() match := tr.MatchFirst(ibsen) expected := newMatchString(937, 0, "Hedvig") if !MatchEqual(expected, match) { t.Errorf("expected %v, got %v\n", expected, match) } } func TestHedvig(t *testing.T) { ibsen, err := ioutil.ReadFile("./test_data/Ibsen.txt") if err != nil { t.Error(err) } matches := NewTrieBuilder().AddString("Hedvig").Build().Match(ibsen) if len(matches) != 134 { 
fmt.Printf("expected to find 134 Hedvig's, got %d\n", len(matches)) } } func BenchmarkTrieBuild(b *testing.B) { patterns, err := readPatterns("./test_data/NSF-ordlisten.cleaned.txt") if err != nil { b.Error(err) } b.Run("100", func(b *testing.B) { for n := 0; n < b.N; n++ { NewTrieBuilder().AddStrings(patterns[:100]).Build() } }) b.Run("1000", func(b *testing.B) { for n := 0; n < b.N; n++ { NewTrieBuilder().AddStrings(patterns[:1000]).Build() } }) b.Run("10000", func(b *testing.B) { for n := 0; n < b.N; n++ { NewTrieBuilder().AddStrings(patterns[:10000]).Build() } }) b.Run("100000", func(b *testing.B) { for n := 0; n < b.N; n++ { NewTrieBuilder().AddStrings(patterns[:100000]).Build() } }) } func BenchmarkMatchIbsen(b *testing.B) { patterns, err := readPatterns("./test_data/NSF-ordlisten.cleaned.txt") if err != nil { b.Error(err) } ibsen, err := ioutil.ReadFile("./test_data/Ibsen.txt") if err != nil { b.Error(err) } trie := NewTrieBuilder().AddStrings(patterns[:10000]).Build() b.Run("100", func(b *testing.B) { for n := 0; n < b.N; n++ { trie.Match(ibsen[:100]) } }) b.Run("1000", func(b *testing.B) { for n := 0; n < b.N; n++ { trie.Match(ibsen[:1000]) } }) b.Run("10000", func(b *testing.B) { for n := 0; n < b.N; n++ { trie.Match(ibsen[:10000]) } }) b.Run("100000", func(b *testing.B) { for n := 0; n < b.N; n++ { trie.Match(ibsen[:100000]) } }) } func readPatterns(path string) ([]string, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() s := bufio.NewScanner(f) patterns := make([]string, 0) for s.Scan() { patterns = append(patterns, strings.TrimSpace(s.Text())) } if err := s.Err(); err != nil { return nil, err } return patterns, nil }