bwt-0.6.1/0000775000175000017510000000000014277613324011654 5ustar nileshnileshbwt-0.6.1/util.go0000664000175000017510000000046014277613324013160 0ustar nileshnileshpackage bwt import "bytes" // SliceOfByteSlice is [][]byte type SliceOfByteSlice [][]byte func (s SliceOfByteSlice) Len() int { return len(s) } func (s SliceOfByteSlice) Less(i, j int) bool { return bytes.Compare(s[i], s[j]) < 0 } func (s SliceOfByteSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } bwt-0.6.1/go.sum0000664000175000017510000000000014277613324012775 0ustar nileshnileshbwt-0.6.1/go.mod0000664000175000017510000000005214277613324012757 0ustar nileshnileshmodule github.com/shenwei356/bwt go 1.17 bwt-0.6.1/fmi/0000775000175000017510000000000014277613324012427 5ustar nileshnileshbwt-0.6.1/fmi/util.go0000664000175000017510000000077214277613324013741 0ustar nileshnileshpackage fmi type sMatch struct { query []byte start, end int mismatches int } // Stack struct type Stack []sMatch // Empty tell if it is empty func (s Stack) Empty() bool { return len(s) == 0 } // Peek return the last element func (s Stack) Peek() sMatch { return s[len(s)-1] } // Put puts element to stack func (s *Stack) Put(i sMatch) { (*s) = append((*s), i) } // Pop pops element from the stack func (s *Stack) Pop() sMatch { d := (*s)[len(*s)-1] (*s) = (*s)[:len(*s)-1] return d } bwt-0.6.1/fmi/fmi_test.go0000664000175000017510000000453414277613324014576 0ustar nileshnileshpackage fmi import ( "testing" "github.com/shenwei356/bwt" ) type Case struct { s, q string m int r []int } var cases = []Case{ {"", "abc", 0, []int{}}, {"mississippi", "", 0, []int{}}, {"mississippi", "iss", 0, []int{1, 4}}, {"abcabcabc", "abc", 0, []int{0, 3, 6}}, {"abcabcabc", "gef", 0, []int{}}, {"abcabcabc", "gef", 0, []int{}}, {"abcabcabc", "xef", 0, []int{}}, {"abcabcabc", "xabcb", 1, []int{}}, {"abcabcabc", "xabcb", 2, []int{2}}, {"abcabd", "abc", 1, []int{0, 3}}, {"acctatac", "ac", 0, []int{0, 6}}, {"acctatac", "tac", 0, []int{5}}, {"acctatac", "tac", 1, []int{3, 5}}, {"acctatac", "taz", 1, []int{3, 5}}, {"ccctatac", "tzc", 1, []int{5}}, {"acctatac", "atac", 0, []int{4}}, {"acctatac", "acctatac", 0, []int{0}}, {"acctatac", "acctatac", 1, []int{0}}, {"acctatac", "cctatac", 1, []int{1}}, {"acctatac", "caa", 2, []int{1, 2, 3, 4, 5}}, {"acctatac", "caa", 3, []int{0, 1, 2, 3, 4, 5}}, } func TestLocate(t *testing.T) { var err error var match bool var fmi *FMIndex for i, c := range cases { fmi = NewFMIndex() _, err = fmi.Transform([]byte(c.s)) if err != nil { if c.s == "" && err == bwt.ErrEmptySeq { continue } else { t.Errorf("case #%d: Transform: %s", i+1, err) return } } match, err = fmi.Match([]byte(c.q), c.m) if err != nil { t.Errorf("case #%d: Locate: %s", i, err) return } if match != (len(c.r) > 0) { t.Errorf("case #%d: Match '%s' in '%s' (allow %d mismatch), result: %v. right answer: %v", i+1, c.q, c.s, c.m, match, len(c.r) > 0) return } } } func TestMatch(t *testing.T) { var err error var loc []int var fmi *FMIndex for i, c := range cases { fmi = NewFMIndex() _, err = fmi.Transform([]byte(c.s)) if err != nil { if c.s == "" && err == bwt.ErrEmptySeq { continue } else { t.Errorf("case #%d: Transform: %s", i+1, err) return } } loc, err = fmi.Locate([]byte(c.q), c.m) if err != nil { t.Errorf("case #%d: Locate: %s", i, err) return } if len(loc) != len(c.r) { t.Errorf("case #%d: Locate '%s' in '%s' (allow %d mismatch), result: %d. right answer: %d", i+1, c.q, c.s, c.m, loc, c.r) return } for j := 0; j < len(loc); j++ { if loc[j] != c.r[j] { t.Errorf("case #%d: Locate '%s' in '%s' (allow %d mismatch), result: %d. right answer: %d", i+1, c.q, c.s, c.m, loc, c.r) return } } } } bwt-0.6.1/fmi/fmi.go0000664000175000017510000002434014277613324013534 0ustar nileshnileshpackage fmi import ( "bytes" "fmt" "sort" "strings" "github.com/shenwei356/bwt" ) // FMIndex is Burrows-Wheeler Index type FMIndex struct { // EndSymbol EndSymbol byte // SuffixArray SuffixArray []int // Burrows-Wheeler Transform BWT []byte // First column of BWM F []byte // Alphabet in the BWT Alphabet []byte // Count of Letters in Alphabet. // CountOfLetters map[byte]int CountOfLetters []int // slice is faster han map // C[c] is a table that, for each character c in the alphabet, // contains the number of occurrences of lexically smaller characters // in the text. // C map[byte]int C []int // slice is faster han map // Occ(c, k) is the number of occurrences of character c in the // prefix L[1..k], k is 0-based. // Occ map[byte]*[]int32 Occ []*[]int32 // slice is faster han map } // NewFMIndex is constructor of FMIndex func NewFMIndex() *FMIndex { fmi := new(FMIndex) fmi.EndSymbol = byte(0) return fmi } // Transform return Burrows-Wheeler-Transform of s func (fmi *FMIndex) Transform(s []byte) ([]byte, error) { if len(s) == 0 { return nil, bwt.ErrEmptySeq } var err error sa := bwt.SuffixArray(s) fmi.SuffixArray = sa fmi.BWT, err = bwt.FromSuffixArray(s, fmi.SuffixArray, fmi.EndSymbol) if err != nil { return nil, err } F := make([]byte, len(s)+1) F[0] = fmi.EndSymbol for i := 1; i <= len(s); i++ { F[i] = s[sa[i]] } fmi.F = F // fmi.CountOfLetters = byteutil.CountOfByte(fmi.BWT) // delete(fmi.CountOfLetters, fmi.EndSymbol) count := make([]int, 128) for _, b := range fmi.BWT { count[b]++ } count[fmi.EndSymbol] = 0 fmi.CountOfLetters = count // fmi.Alphabet = byteutil.AlphabetFromCountOfByte(fmi.CountOfLetters) alphabet := make([]byte, 0, 128) for b, c := range count { if c > 0 { alphabet = append(alphabet, byte(b)) } } fmi.Alphabet = alphabet fmi.C = computeC(fmi.F) fmi.Occ = computeOccurrence(fmi.BWT, fmi.Alphabet) return fmi.BWT, nil } // Last2First mapping func (fmi *FMIndex) Last2First(i int) int { c := fmi.BWT[i] return fmi.C[c] + int((*fmi.Occ[c])[i]) } func (fmi *FMIndex) nextLetterInAlphabet(c byte) byte { var nextLetter byte for i, letter := range fmi.Alphabet { if letter == c { if i < len(fmi.Alphabet)-1 { nextLetter = fmi.Alphabet[i+1] } else { nextLetter = fmi.Alphabet[i] } break } } return nextLetter } // Locate locates the pattern func (fmi *FMIndex) Locate(query []byte, mismatches int) ([]int, error) { if len(query) == 0 { return []int{}, nil } var locations []int locationsMap := make(map[int]struct{}) if mismatches == 0 { // letters := byteutil.Alphabet(query) count := make([]int, 128) for _, b := range query { if count[b] == 0 { count[b]++ } } letters := make([]byte, 0, 128) for b, c := range count { if c > 0 { letters = append(letters, byte(b)) } } for _, letter := range letters { // query having letter not in alphabet // if _, ok := fmi.CountOfLetters[letter]; !ok { if fmi.CountOfLetters[letter] == 0 { return locations, nil } } } n := len(fmi.BWT) var matches Stack // start and end are 0-based matches.Put(sMatch{query: query, start: 0, end: n - 1, mismatches: mismatches}) // fmt.Printf("====%s====\n", query) // fmt.Println(fmi) var match sMatch var last, c byte var start, end int var m int var letters []byte // var ok bool for !matches.Empty() { match = matches.Pop() query = match.query[0 : len(match.query)-1] last = match.query[len(match.query)-1] if match.mismatches == 0 { letters = []byte{last} } else { letters = fmi.Alphabet } // fmt.Println("\n--------------------------------------------") // fmt.Printf("%s, %s, %c\n", match.query, query, last) // fmt.Printf("query: %s, last: %c\n", query, last) for _, c = range letters { // if _, ok = fmi.CountOfLetters[c]; !ok { // letter not in alphabet if fmi.CountOfLetters[c] == 0 { continue } // fmt.Printf("letter: %c, start: %d, end: %d, mismatches: %d\n", c, match.start, match.end, match.mismatches) if match.start == 0 { start = fmi.C[c] + 0 } else { start = fmi.C[c] + int((*fmi.Occ[c])[match.start-1]) } end = fmi.C[c] + int((*fmi.Occ[c])[match.end]-1) // fmt.Printf(" s: %d, e: %d\n", start, end) if start > end { continue } if len(query) == 0 { for _, i := range fmi.SuffixArray[start : end+1] { // fmt.Printf(" >>> found: %d\n", i) locationsMap[i] = struct{}{} } } else { m = match.mismatches if c != last { if match.mismatches > 1 { m = match.mismatches - 1 } else { m = 0 } } // fmt.Printf(" >>> candidate: query: %s, start: %d, end: %d, m: %d\n", query, start, end, m) matches.Put(sMatch{query: query, start: start, end: end, mismatches: m}) } } } i := 0 locations = make([]int, len(locationsMap)) for loc := range locationsMap { locations[i] = loc i++ } sort.Ints(locations) return locations, nil } // Match is a simple version of Locate, which returns immediately for a match. func (fmi *FMIndex) Match(query []byte, mismatches int) (bool, error) { if len(query) == 0 { return false, nil } if mismatches == 0 { // letters := byteutil.Alphabet(query) count := make([]int, 128) for _, b := range query { if count[b] == 0 { count[b]++ } } letters := make([]byte, 0, 128) for b, c := range count { if c > 0 { letters = append(letters, byte(b)) } } for _, letter := range letters { // query having letter not in alphabet // if _, ok := fmi.CountOfLetters[letter]; !ok { if fmi.CountOfLetters[letter] == 0 { return false, nil } } } n := len(fmi.BWT) var matches Stack // start and end are 0-based matches.Put(sMatch{query: query, start: 0, end: n - 1, mismatches: mismatches}) // fmt.Printf("====%s====\n", query) // fmt.Println(fmi) var match sMatch var last, c byte var start, end int var m int var letters []byte // var ok bool for !matches.Empty() { match = matches.Pop() query = match.query[0 : len(match.query)-1] last = match.query[len(match.query)-1] if match.mismatches == 0 { letters = []byte{last} } else { letters = fmi.Alphabet } // fmt.Println("\n--------------------------------------------") // fmt.Printf("%s, %s, %c\n", match.query, query, last) // fmt.Printf("query: %s, last: %c\n", query, last) for _, c = range letters { // if _, ok = fmi.CountOfLetters[c]; !ok { // letter not in alphabet if fmi.CountOfLetters[c] == 0 { continue } // fmt.Printf("letter: %c, start: %d, end: %d, mismatches: %d\n", c, match.start, match.end, match.mismatches) if match.start == 0 { start = fmi.C[c] + 0 } else { start = fmi.C[c] + int((*fmi.Occ[c])[match.start-1]) } end = fmi.C[c] + int((*fmi.Occ[c])[match.end]-1) // fmt.Printf(" s: %d, e: %d\n", start, end) if start > end { continue } if len(query) == 0 { return true, nil } else { m = match.mismatches if c != last { if match.mismatches > 1 { m = match.mismatches - 1 } else { m = 0 } } // fmt.Printf(" >>> candidate: query: %s, start: %d, end: %d, m: %d\n", query, start, end, m) matches.Put(sMatch{query: query, start: start, end: end, mismatches: m}) } } } return false, nil } func (fmi *FMIndex) String() string { var buffer bytes.Buffer buffer.WriteString(fmt.Sprintf("EndSymbol: %c\n", fmi.EndSymbol)) buffer.WriteString(fmt.Sprintf("BWT: %s\n", string(fmi.BWT))) buffer.WriteString(fmt.Sprintf("Alphabet: %s\n", string(fmi.Alphabet))) buffer.WriteString("F:\n") buffer.WriteString(string(fmi.F) + "\n") buffer.WriteString("C:\n") for _, letter := range fmi.Alphabet { buffer.WriteString(fmt.Sprintf(" %c: %d\n", letter, fmi.C[letter])) } buffer.WriteString("Occ:\n") buffer.WriteString(fmt.Sprintf(" BWT[%s]\n", strings.Join(strings.Split(string(fmi.BWT), ""), " "))) for _, letter := range fmi.Alphabet { buffer.WriteString(fmt.Sprintf(" %c: %v\n", letter, fmi.Occ[letter])) } buffer.WriteString("SA:\n") buffer.WriteString(fmt.Sprintf(" %d\n", fmi.SuffixArray)) return buffer.String() } // ComputeC computes C. // C[c] is a table that, for each character c in the alphabet, // contains the number of occurrences of lexically smaller characters // in the text. // // func ComputeC(L []byte, alphabet []byte) map[byte]int { // if alphabet == nil { // alphabet = byteutil.Alphabet(L) // } // C := make(map[byte]int, len(alphabet)) // count := 0 // for _, c := range L { // if _, ok := C[c]; !ok { // C[c] = count // } // count++ // } // return C // } func computeC(L []byte) []int { C := make([]int, 128) count := 0 for _, c := range L { if C[c] == 0 { C[c] = count } count++ } return C } // ComputeOccurrence returns occurrence information. // Occ(c, k) is the number of occurrences of character c in the prefix L[1..k] // // func ComputeOccurrence(bwt []byte, letters []byte) map[byte]*[]int32 { // if letters == nil { // letters = byteutil.Alphabet(bwt) // } // occ := make(map[byte]*[]int32, len(letters)-1) // for _, letter := range letters { // t := make([]int32, 1, len(bwt)) // t[0] = 0 // occ[letter] = &t // } // t := make([]int32, 1, len(bwt)) // t[0] = 1 // occ[bwt[0]] = &t // var letter, k byte // var v *[]int32 // for _, letter = range bwt[1:] { // for k, v = range occ { // if k == letter { // *v = append(*v, (*v)[len(*v)-1]+1) // } else { // *v = append(*v, (*v)[len(*v)-1]) // } // } // } // return occ // } func computeOccurrence(bwt []byte, letters []byte) []*[]int32 { if letters == nil { count := make([]int, 128) for _, b := range bwt { if count[b] == 0 { count[b]++ } } letters = make([]byte, 0, 128) for b, c := range count { if c > 0 { letters = append(letters, byte(b)) } } } occ := make([]*[]int32, 128) for _, letter := range letters { t := make([]int32, 1, len(bwt)) t[0] = 0 occ[letter] = &t } t := make([]int32, 1, len(bwt)) t[0] = 1 occ[bwt[0]] = &t var letter byte var k, letterInt int var v *[]int32 for _, letter = range bwt[1:] { letterInt = int(letter) for k, v = range occ { if v == nil { continue } if k == letterInt { *v = append(*v, (*v)[len(*v)-1]+1) } else { *v = append(*v, (*v)[len(*v)-1]) } } } return occ } bwt-0.6.1/bwt_test.go0000664000175000017510000000363114277613324014041 0ustar nileshnileshpackage bwt import ( "fmt" "math/rand" "testing" ) func TestTransformAndInverseTransform(t *testing.T) { s := "abracadabra" trans := "ard$rcaaaabb" tr, err := Transform([]byte(s), '$') if err != nil { t.Error(err) } if string(tr) != trans { t.Error("Test failed: Transform") } if string(InverseTransform([]byte(trans), '$')) != s { t.Error("Test failed: InverseTransform") } } func TestFromSuffixArray(t *testing.T) { s := "GATGCGAGAGATG" trans := "GGGGGGTCAA$TAA" sa := SuffixArray([]byte(s)) B, err := FromSuffixArray([]byte(s), sa, '$') if err != nil { t.Error("Test failed: FromSuffixArray error") } if string(B) != trans { t.Error("Test failed: FromSuffixArray returns wrong result") } } func TestFromSuffixArrayEmptySeq(t *testing.T) { s := "" sa := SuffixArray([]byte(s)) _, err := FromSuffixArray([]byte(s), sa, '$') if err == nil || err != ErrEmptySeq { t.Error("Test failed: FromSuffixArray error") } } func TestSA(t *testing.T) { s := "mississippi" sa := SuffixArray([]byte(s)) sa1 := []int{11, 10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2} // fmt.Printf("%s\nanswer: %v, result: %v", s, sa1, sa) if len(sa) != len(sa1) { t.Error(fmt.Errorf("sa error. answer: %v, result: %v", sa1, sa)) return } for i := range sa { if sa[i] != sa1[i] { t.Error(fmt.Errorf("sa error. answer: %v, result: %v", sa1, sa)) return } } } var cases [][]byte func init() { rand.Seed(1) alphabet := "ACGT" n := len(alphabet) scales := []float32{1e3, 1e5} cases = make([][]byte, len(scales)) for i, scale := range scales { l := rand.Float32() * scale * 10 buf := make([]byte, int(l)) for j := 0; j < int(l); j++ { buf[j] = alphabet[rand.Intn(n)] } cases[i] = buf } } var result []byte func BenchmarkTransform(t *testing.B) { var r []byte var err error for i := 0; i < t.N; i++ { r, err = Transform(cases[0], '$') if err != nil { t.Error(err) return } } result = r } bwt-0.6.1/bwt.go0000664000175000017510000000610414277613324013000 0ustar nileshnileshpackage bwt import ( "errors" "index/suffixarray" "reflect" "sort" ) // CheckEndSymbol is a global variable for checking end symbol before Burrows–Wheeler transform var CheckEndSymbol = true // ErrEndSymbolExisted means you should choose another EndSymbol var ErrEndSymbolExisted = errors.New("bwt: end-symbol existed in string") // ErrEmptySeq means a empty sequence is given var ErrEmptySeq = errors.New("bwt: empty sequence") // Transform returns Burrows–Wheeler transform of a byte slice. // See https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform func Transform(s []byte, es byte) ([]byte, error) { if len(s) == 0 { return nil, ErrEmptySeq } if CheckEndSymbol { for _, c := range s { if c == es { return nil, ErrEndSymbolExisted } } } sa := SuffixArray(s) bwt, err := FromSuffixArray(s, sa, es) return bwt, err } // InverseTransform reverses the bwt to original byte slice. Not optimized yet. func InverseTransform(t []byte, es byte) []byte { n := len(t) lines := make([][]byte, n) for i := 0; i < n; i++ { lines[i] = make([]byte, n) } for i := 0; i < n; i++ { for j := 0; j < n; j++ { lines[j][n-1-i] = t[j] } sort.Sort(SliceOfByteSlice(lines)) } s := make([]byte, n-1) for _, line := range lines { if line[n-1] == es { s = line[0 : n-1] break } } return s } // SuffixArray returns the suffix array of s. // This function is the performance bottleneck of bwt and bwt/fmi package, with O(nlogn). func SuffixArray(s []byte) []int { // sa := make([]int, len(s)+1) // sa[0] = len(s) // for i := 0; i < len(s); i++ { // sa[i+1] = i // } // sort.Slice(sa[1:], func(i, j int) bool { // return bytes.Compare(s[sa[i+1]:], s[sa[j+1]:]) < 0 // }) // return sa // https://github.com/shenwei356/bwt/issues/3 . // nearly copy from https://github.com/crazyleg/burrow-wheelers-golang/blob/master/pkg/bwtgolang/suffixarrayBWT.go#L8 // It's 4X faster! // // benchmark old ns/op new ns/op delta // BenchmarkTransform-16 1339346 310706 -76.80% // // benchmark old allocs new allocs delta // BenchmarkTransform-16 4 5 +25.00% // // benchmark old bytes new bytes delta // BenchmarkTransform-16 55362 79962 +44.43% _sa := suffixarray.New(s) tmp := reflect.ValueOf(_sa).Elem().FieldByName("sa").FieldByIndex([]int{0}) var sa []int = make([]int, len(s)+1) sa[0] = len(s) for i := 0; i < len(s); i++ { sa[i+1] = int(tmp.Index(i).Int()) } return sa } // ErrInvalidSuffixArray means length of sa is not equal to 1+len(s) var ErrInvalidSuffixArray = errors.New("bwt: invalid suffix array") // FromSuffixArray compute BWT from sa func FromSuffixArray(s []byte, sa []int, es byte) ([]byte, error) { if len(s) == 0 { return nil, ErrEmptySeq } if len(s)+1 != len(sa) || sa[0] != len(s) { return nil, ErrInvalidSuffixArray } bwt := make([]byte, len(sa)) bwt[0] = s[len(s)-1] for i := 1; i < len(sa); i++ { if sa[i] == 0 { bwt[i] = es } else { bwt[i] = s[sa[i]-1] } } return bwt, nil } bwt-0.6.1/README.md0000664000175000017510000000104014277613324013126 0ustar nileshnilesh# bwt [![Go Reference](https://pkg.go.dev/badge/github.com/shenwei356/bwt.svg)](https://pkg.go.dev/github.com/shenwei356/bwt) [![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/bwt)](https://goreportcard.com/report/github.com/shenwei356/bwt) Burrows-Wheeler Transform and FM-index in golang ## Install This package is "go-gettable", just: go get -u github.com/shenwei356/bwt ## Licence Copyright (c) 2015-2021, Wei Shen (shenwei356@gmail.com) [MIT License](https://github.com/shenwei356/bwt/blob/master/LICENSE) bwt-0.6.1/LICENSE0000664000175000017510000000211114277613324012654 0ustar nileshnileshCopyright (c) 2015-2021 Wei Shen (shenwei356@gmail.com) The MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. bwt-0.6.1/.gitignore0000775000175000017510000000041114277613324013643 0ustar nileshnilesh# Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.directory