pax_global_header00006660000000000000000000000064151263155650014523gustar00rootroot0000000000000052 comment=81021786593092a00a3f78e5c4f98ce283748c0c golang-github-clipperhouse-stringish-0.1.1+ds/000077500000000000000000000000001512631556500213635ustar00rootroot00000000000000golang-github-clipperhouse-stringish-0.1.1+ds/.github/000077500000000000000000000000001512631556500227235ustar00rootroot00000000000000golang-github-clipperhouse-stringish-0.1.1+ds/.github/workflows/000077500000000000000000000000001512631556500247605ustar00rootroot00000000000000golang-github-clipperhouse-stringish-0.1.1+ds/.github/workflows/gotest.yml000066400000000000000000000007661512631556500270210ustar00rootroot00000000000000name: Test on: push: branches: [ main ] pull_request: branches: [ main ] jobs: all: runs-on: ubuntu-latest strategy: matrix: go-version: ['1.18', '1.19', '1.20', '1.21', '1.22', '1.23', '1.24', '1.25'] steps: - name: Check out code uses: actions/checkout@v4 - name: Set up Go uses: actions/setup-go@v5 with: go-version: ${{ matrix.go-version }} cache: true - name: Run test run: go test ./... 
-race -short golang-github-clipperhouse-stringish-0.1.1+ds/.gitignore000066400000000000000000000000211512631556500233440ustar00rootroot00000000000000.DS_Store *.test golang-github-clipperhouse-stringish-0.1.1+ds/LICENSE000066400000000000000000000020551512631556500223720ustar00rootroot00000000000000MIT License Copyright (c) 2025 Matt Sherman Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. golang-github-clipperhouse-stringish-0.1.1+ds/README.md000066400000000000000000000040061512631556500226420ustar00rootroot00000000000000# stringish A small Go module that provides a generic type constraint for “string-like” data, and a utf8 package that works with both strings and byte slices without conversions. 
```go
type Interface interface {
	~[]byte | ~string
}
```

[![Go Reference](https://pkg.go.dev/badge/github.com/clipperhouse/stringish/utf8.svg)](https://pkg.go.dev/github.com/clipperhouse/stringish/utf8)
[![Test Status](https://github.com/clipperhouse/stringish/actions/workflows/gotest.yml/badge.svg)](https://github.com/clipperhouse/stringish/actions/workflows/gotest.yml)

## Install

```
go get github.com/clipperhouse/stringish
```

## Examples

```go
import (
	"github.com/clipperhouse/stringish"
	"github.com/clipperhouse/stringish/utf8"
)

s := "Hello, 世界"
r, size := utf8.DecodeRune(s) // not DecodeRuneInString 🎉

b := []byte("Hello, 世界")
r, size = utf8.DecodeRune(b) // same API!

func MyFoo[T stringish.Interface](s T) T {
	// pass a string or a []byte
	// iterate, slice, transform, whatever
}
```

## Motivation

Sometimes we want APIs to accept `string` or `[]byte` without having to convert between those types. That conversion usually allocates!

By implementing with `stringish.Interface`, we can have a single API and a single implementation for both types: one `Foo` instead of `Foo` and `FooString`.

We have converted the [`unicode/utf8` package](https://github.com/clipperhouse/stringish/blob/main/utf8/utf8.go) as an example -- note the absence of `*InString` funcs. We might look at `x/text` next.
## Used by - clipperhouse/uax29: [stringish trie](https://github.com/clipperhouse/uax29/blob/master/graphemes/trie.go#L27), [stringish iterator](https://github.com/clipperhouse/uax29/blob/master/internal/iterators/iterator.go#L9), [stringish SplitFunc](https://github.com/clipperhouse/uax29/blob/master/graphemes/splitfunc.go#L21) - [clipperhouse/displaywidth](https://github.com/clipperhouse/displaywidth) ## Prior discussion - [Consideration of similar by the Go team](https://github.com/golang/go/issues/48643) golang-github-clipperhouse-stringish-0.1.1+ds/go.mod000066400000000000000000000000621512631556500224670ustar00rootroot00000000000000module github.com/clipperhouse/stringish go 1.18 golang-github-clipperhouse-stringish-0.1.1+ds/interface.go000066400000000000000000000001031512631556500236440ustar00rootroot00000000000000package stringish type Interface interface { ~[]byte | ~string } golang-github-clipperhouse-stringish-0.1.1+ds/utf8/000077500000000000000000000000001512631556500222515ustar00rootroot00000000000000golang-github-clipperhouse-stringish-0.1.1+ds/utf8/LICENSE000066400000000000000000000026551512631556500232660ustar00rootroot00000000000000Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. golang-github-clipperhouse-stringish-0.1.1+ds/utf8/example_test.go000066400000000000000000000050201512631556500252670ustar00rootroot00000000000000// Copyright 2013 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package utf8_test import ( "fmt" "github.com/clipperhouse/stringish/utf8" ) func ExampleDecodeLastRune() { s := "Hello, 世界" for len(s) > 0 { r, size := utf8.DecodeLastRune(s) fmt.Printf("%c %v\n", r, size) s = s[:len(s)-size] } // Output: // 界 3 // 世 3 // 1 // , 1 // o 1 // l 1 // l 1 // e 1 // H 1 } func ExampleDecodeRune() { s := "Hello, 世界" for len(s) > 0 { r, size := utf8.DecodeRune(s) fmt.Printf("%c %v\n", r, size) s = s[size:] } // Output: // H 1 // e 1 // l 1 // l 1 // o 1 // , 1 // 1 // 世 3 // 界 3 } func ExampleEncodeRune() { r := '世' buf := make([]byte, 3) n := utf8.EncodeRune(buf, r) fmt.Println(buf) fmt.Println(n) // Output: // [228 184 150] // 3 } func ExampleEncodeRune_outOfRange() { runes := []rune{ // Less than 0, out of range. -1, // Greater than 0x10FFFF, out of range. 0x110000, // The Unicode replacement character. 
utf8.RuneError, } for i, c := range runes { buf := make([]byte, 3) size := utf8.EncodeRune(buf, c) fmt.Printf("%d: %d %[2]s %d\n", i, buf, size) } // Output: // 0: [239 191 189] � 3 // 1: [239 191 189] � 3 // 2: [239 191 189] � 3 } func ExampleFullRune() { s := string([]byte{228, 184, 150}) // 世 fmt.Println(utf8.FullRune(s)) fmt.Println(utf8.FullRune(s[:2])) // Output: // true // false } func ExampleRuneCount() { s := []byte("Hello, 世界") fmt.Println("bytes =", len(s)) fmt.Println("runes =", utf8.RuneCount(s)) // Output: // bytes = 13 // runes = 9 } func ExampleRuneLen() { fmt.Println(utf8.RuneLen('a')) fmt.Println(utf8.RuneLen('界')) // Output: // 1 // 3 } func ExampleRuneStart() { s := "a界" fmt.Println(utf8.RuneStart(s[0])) fmt.Println(utf8.RuneStart(s[1])) fmt.Println(utf8.RuneStart(s[2])) // Output: // true // true // false } func ExampleValid() { valid := "Hello, 世界" invalid := string([]byte{0xff, 0xfe, 0xfd}) fmt.Println(utf8.Valid(valid)) fmt.Println(utf8.Valid(invalid)) // Output: // true // false } func ExampleValidRune() { valid := 'a' invalid := rune(0xfffffff) fmt.Println(utf8.ValidRune(valid)) fmt.Println(utf8.ValidRune(invalid)) // Output: // true // false } func ExampleAppendRune() { buf1 := utf8.AppendRune(nil, 0x10000) buf2 := utf8.AppendRune([]byte("init"), 0x10000) fmt.Println(string(buf1)) fmt.Println(string(buf2)) // Output: // 𐀀 // init𐀀 } golang-github-clipperhouse-stringish-0.1.1+ds/utf8/utf8.go000066400000000000000000000316471512631556500235010ustar00rootroot00000000000000// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // This file is a modified version of unicode/utf8.go from the Go standard library. // Modifications are licensed under the MIT License. // // MIT License for modifications: // Copyright (c) 2025 Matt Sherman // Package utf8 implements functions and constants to support text encoded in // UTF-8. 
// It includes functions to translate between runes and UTF-8 byte sequences.
// See https://en.wikipedia.org/wiki/UTF-8
package utf8

import "github.com/clipperhouse/stringish"

// The conditions RuneError==unicode.ReplacementChar and
// MaxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.

// Numbers fundamental to the encoding.
const (
	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
	RuneSelf  = 0x80         // characters below RuneSelf are represented as themselves in a single byte.
	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
)

// Code points in the surrogate range are not valid for UTF-8.
const (
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)

const (
	// Marker bits OR-ed into encoded bytes: tx tags a continuation byte,
	// t2/t3/t4 tag the leading byte of a 2-, 3- and 4-byte encoding.
	t1 = 0b00000000
	tx = 0b10000000
	t2 = 0b11000000
	t3 = 0b11100000
	t4 = 0b11110000
	t5 = 0b11111000

	// Payload masks: maskx selects the data bits of a continuation byte,
	// mask2/mask3/mask4 those of a 2-, 3- and 4-byte leading byte.
	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111
	mask4 = 0b00000111

	// Largest rune that fits in a 1-, 2- and 3-byte encoding respectively.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// table below. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)

// The three bytes of the UTF-8 encoding of RuneError, precomputed so the
// encoders can emit them directly for out-of-range runes.
const (
	runeErrorByte0 = t3 | (RuneError >> 12)
	runeErrorByte1 = tx | (RuneError>>6)&maskx
	runeErrorByte2 = tx | RuneError&maskx
)

// first is information about the first byte in a UTF-8 sequence.
// Each entry is one of the xx/as/s1..s7 codes: the low three bits (x&7) give
// the encoded length and the high nibble (x>>4) indexes acceptRanges, which
// bounds the legal values of the second byte.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}

// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
// Only indexes 0-4 are populated; they are the ranges referred to as
// "accept 0" .. "accept 4" by the s1..s7 codes stored in first.
var acceptRanges = [16]acceptRange{
	0: {locb, hicb},
	1: {0xA0, hicb},
	2: {locb, 0x9F},
	3: {0x90, hicb},
	4: {locb, 0x8F},
}

// FullRune reports whether s begins with a full UTF-8 encoding of a rune.
// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
func FullRune[T stringish.Interface](s T) bool {
	n := len(s)
	if n == 0 {
		return false
	}
	b0 := s[0]
	x := first[b0]
	// x&7 is the length the leading byte announces (1 for ASCII and for
	// invalid leading bytes).
	if n >= int(x&7) {
		return true // ASCII, invalid or valid.
	}
	// Must be short or invalid.
	// A continuation byte that is already out of range makes the sequence
	// invalid, and an invalid sequence counts as a full (width-1) rune.
	accept := acceptRanges[x>>4]
	if n > 1 {
		b1 := s[1]
		if b1 < accept.lo || accept.hi < b1 {
			return true
		}
	}
	if n > 2 {
		b2 := s[2]
		if b2 < locb || hicb < b2 {
			return true
		}
	}
	return false
}

// DecodeRune unpacks the first UTF-8 encoding in s and returns the rune and
// its width in bytes. If s is empty it returns ([RuneError], 0). Otherwise, if
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
// results for correct, non-empty UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeRune[T stringish.Interface](s T) (r rune, size int) {
	n := len(s)
	if n < 1 {
		return RuneError, 0
	}
	b0 := s[0]
	x := first[b0]
	if x >= as {
		// The following code simulates an additional check for x == xx and
		// handling the ASCII and invalid cases accordingly. This mask-and-or
		// approach prevents an additional branch.
		// Only as (0xF0) and xx (0xF1) reach here; they differ in bit 0,
		// which the shifts below smear across the whole word.
		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
		return rune(b0)&^mask | RuneError&mask, 1
	}
	sz := int(x & 7)
	accept := acceptRanges[x>>4]
	if n < sz {
		return RuneError, 1
	}
	// n >= sz from here on, so every index read below is in range.
	var b1, b2, b3 byte
	if sz > 1 {
		b1 = s[1]
	}
	if sz > 2 {
		b2 = s[2]
	}
	if sz > 3 {
		b3 = s[3]
	}
	if b1 < accept.lo || accept.hi < b1 {
		return RuneError, 1
	}
	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
		return rune(b0&mask2)<<6 | rune(b1&maskx), 2
	}
	if b2 < locb || hicb < b2 {
		return RuneError, 1
	}
	if sz <= 3 {
		return rune(b0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
	}
	if b3 < locb || hicb < b3 {
		return RuneError, 1
	}
	return rune(b0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
}

// DecodeLastRune unpacks the last UTF-8 encoding in s and returns the rune and
// its width in bytes. If s is empty it returns ([RuneError], 0). Otherwise, if
// the encoding is invalid, it returns (RuneError, 1). Both are impossible
// results for correct, non-empty UTF-8.
//
// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
// out of range, or is not the shortest possible UTF-8 encoding for the
// value. No other validation is performed.
func DecodeLastRune[T stringish.Interface](s T) (r rune, size int) {
	end := len(s)
	if end == 0 {
		return RuneError, 0
	}
	start := end - 1
	lastByte := s[start]
	r = rune(lastByte)
	if r < RuneSelf {
		return r, 1
	}
	// guard against O(n^2) behavior when traversing
	// backwards through strings with long sequences of
	// invalid UTF-8.
	lim := end - UTFMax
	if lim < 0 {
		lim = 0
	}
	// Walk back at most UTFMax bytes looking for a byte that can start a rune.
	for start--; start >= lim; start-- {
		b := s[start]
		if RuneStart(b) {
			break
		}
	}
	if start < 0 {
		start = 0
	}
	r, size = DecodeRune(s[start:end])
	// The decoded rune must end exactly at end; otherwise the trailing byte
	// is reported as a width-1 error.
	if start+size != end {
		return RuneError, 1
	}
	return r, size
}

// RuneLen returns the number of bytes in the UTF-8 encoding of the rune.
// It returns -1 if the rune is not a valid value to encode in UTF-8.
func RuneLen(r rune) int { switch { case r < 0: return -1 case r <= rune1Max: return 1 case r <= rune2Max: return 2 case surrogateMin <= r && r <= surrogateMax: return -1 case r <= rune3Max: return 3 case r <= MaxRune: return 4 } return -1 } // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. // If the rune is out of range, it writes the encoding of [RuneError]. // It returns the number of bytes written. func EncodeRune(p []byte, r rune) int { // This function is inlineable for fast handling of ASCII. if uint32(r) <= rune1Max { p[0] = byte(r) return 1 } return encodeRuneNonASCII(p, r) } func encodeRuneNonASCII(p []byte, r rune) int { // Negative values are erroneous. Making it unsigned addresses the problem. switch i := uint32(r); { case i <= rune2Max: _ = p[1] // eliminate bounds checks p[0] = t2 | byte(r>>6) p[1] = tx | byte(r)&maskx return 2 case i < surrogateMin, surrogateMax < i && i <= rune3Max: _ = p[2] // eliminate bounds checks p[0] = t3 | byte(r>>12) p[1] = tx | byte(r>>6)&maskx p[2] = tx | byte(r)&maskx return 3 case i > rune3Max && i <= MaxRune: _ = p[3] // eliminate bounds checks p[0] = t4 | byte(r>>18) p[1] = tx | byte(r>>12)&maskx p[2] = tx | byte(r>>6)&maskx p[3] = tx | byte(r)&maskx return 4 default: _ = p[2] // eliminate bounds checks p[0] = runeErrorByte0 p[1] = runeErrorByte1 p[2] = runeErrorByte2 return 3 } } // AppendRune appends the UTF-8 encoding of r to the end of p and // returns the extended buffer. If the rune is out of range, // it appends the encoding of [RuneError]. func AppendRune(p []byte, r rune) []byte { // This function is inlineable for fast handling of ASCII. if uint32(r) <= rune1Max { return append(p, byte(r)) } return appendRuneNonASCII(p, r) } func appendRuneNonASCII(p []byte, r rune) []byte { // Negative values are erroneous. Making it unsigned addresses the problem. 
switch i := uint32(r); { case i <= rune2Max: return append(p, t2|byte(r>>6), tx|byte(r)&maskx) case i < surrogateMin, surrogateMax < i && i <= rune3Max: return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx) case i > rune3Max && i <= MaxRune: return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx) default: return append(p, runeErrorByte0, runeErrorByte1, runeErrorByte2) } } // RuneCount returns the number of runes in s. Erroneous and short // encodings are treated as single runes of width 1 byte. func RuneCount[T stringish.Interface](s T) int { n := len(s) var count int for i := 0; i < n; i++ { c := s[i] if c >= RuneSelf { _, size := DecodeRune(s[i:]) if size > 0 { count++ i += size - 1 // -1 because the loop will increment } else { count++ } } else { count++ } } return count } // RuneStart reports whether the byte could be the first byte of an encoded, // possibly invalid rune. Second and subsequent bytes always have the top two // bits set to 10. func RuneStart(b byte) bool { return b&0xC0 != 0x80 } // Valid reports whether s consists entirely of valid UTF-8-encoded runes. func Valid[T stringish.Interface](s T) bool { n := len(s) if n == 0 { return true } // Fast path. Check for and skip 8 bytes of ASCII characters per iteration. for i := 0; i+8 <= n; i += 8 { var bytes [8]byte copy(bytes[:], s[i:i+8]) // Combining two 32 bit loads allows the same code to be used // for 32 and 64 bit platforms. // The compiler can generate a 32bit load for first32 and second32 // on many platforms. See test/codegen/memcombine.go. first32 := uint32(bytes[0]) | uint32(bytes[1])<<8 | uint32(bytes[2])<<16 | uint32(bytes[3])<<24 second32 := uint32(bytes[4]) | uint32(bytes[5])<<8 | uint32(bytes[6])<<16 | uint32(bytes[7])<<24 if (first32|second32)&0x80808080 != 0 { // Found a non ASCII byte (>= RuneSelf). 
// Process remaining bytes individually for j := i; j < n; { b := s[j] if b < RuneSelf { j++ continue } x := first[b] if x == xx { return false // Illegal starter byte. } size := int(x & 7) if j+size > n { return false // Short or invalid. } accept := acceptRanges[x>>4] if j+1 < n { c := s[j+1] if c < accept.lo || accept.hi < c { return false } } if size >= 3 && j+2 < n { c := s[j+2] if c < locb || hicb < c { return false } } if size == 4 && j+3 < n { c := s[j+3] if c < locb || hicb < c { return false } } j += size } return true } } // Process remaining bytes individually for i := 0; i < n; { b := s[i] if b < RuneSelf { i++ continue } x := first[b] if x == xx { return false // Illegal starter byte. } size := int(x & 7) if i+size > n { return false // Short or invalid. } accept := acceptRanges[x>>4] if i+1 < n { c := s[i+1] if c < accept.lo || accept.hi < c { return false } } if size >= 3 && i+2 < n { c := s[i+2] if c < locb || hicb < c { return false } } if size == 4 && i+3 < n { c := s[i+3] if c < locb || hicb < c { return false } } i += size } return true } // ValidRune reports whether r can be legally encoded as UTF-8. // Code points that are out of range or a surrogate half are illegal. func ValidRune(r rune) bool { switch { case 0 <= r && r < surrogateMin: return true case surrogateMax < r && r <= MaxRune: return true } return false } golang-github-clipperhouse-stringish-0.1.1+ds/utf8/utf8_test.go000066400000000000000000000431141512631556500245300ustar00rootroot00000000000000// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package utf8_test import ( "bytes" "strings" "testing" "unicode" . "github.com/clipperhouse/stringish/utf8" ) // Validate the constants redefined from unicode. 
func TestConstants(t *testing.T) {
	if MaxRune != unicode.MaxRune {
		t.Errorf("utf8.MaxRune is wrong: %x should be %x", MaxRune, unicode.MaxRune)
	}
	if RuneError != unicode.ReplacementChar {
		t.Errorf("utf8.RuneError is wrong: %x should be %x", RuneError, unicode.ReplacementChar)
	}
}

// Utf8Map pairs a rune with the byte sequence expected as its UTF-8 encoding.
type Utf8Map struct {
	r   rune
	str string
}

// utf8map lists valid rune/encoding pairs shared by the encode, decode and
// sequencing tests.
var utf8map = []Utf8Map{
	{0x0000, "\x00"},
	{0x0001, "\x01"},
	{0x007e, "\x7e"},
	{0x007f, "\x7f"},
	{0x0080, "\xc2\x80"},
	{0x0081, "\xc2\x81"},
	{0x00bf, "\xc2\xbf"},
	{0x00c0, "\xc3\x80"},
	{0x00c1, "\xc3\x81"},
	{0x00c8, "\xc3\x88"},
	{0x00d0, "\xc3\x90"},
	{0x00e0, "\xc3\xa0"},
	{0x00f0, "\xc3\xb0"},
	{0x00f8, "\xc3\xb8"},
	{0x00ff, "\xc3\xbf"},
	{0x0100, "\xc4\x80"},
	{0x07ff, "\xdf\xbf"},
	{0x0400, "\xd0\x80"},
	{0x0800, "\xe0\xa0\x80"},
	{0x0801, "\xe0\xa0\x81"},
	{0x1000, "\xe1\x80\x80"},
	{0xd000, "\xed\x80\x80"},
	{0xd7ff, "\xed\x9f\xbf"}, // last code point before surrogate half.
	{0xe000, "\xee\x80\x80"}, // first code point after surrogate half.
	{0xfffe, "\xef\xbf\xbe"},
	{0xffff, "\xef\xbf\xbf"},
	{0x10000, "\xf0\x90\x80\x80"},
	{0x10001, "\xf0\x90\x80\x81"},
	{0x40000, "\xf1\x80\x80\x80"},
	{0x10fffe, "\xf4\x8f\xbf\xbe"},
	{0x10ffff, "\xf4\x8f\xbf\xbf"},
	{0xFFFD, "\xef\xbf\xbd"},
}

// surrogateMap lists encodings of surrogate code points, which must be
// rejected by the decoder.
var surrogateMap = []Utf8Map{
	{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
	{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
}

// testStrings are the surrounding texts combined with utf8map entries by
// TestSequencing; the last one consists only of continuation bytes.
var testStrings = []string{
	"",
	"abcd",
	"☺☻☹",
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	"\x80\x80\x80\x80",
}

// TestFullRune checks FullRune on both []byte and string inputs: every entry
// of utf8map is a full rune, and the same entry with its last byte removed
// is not.
func TestFullRune(t *testing.T) {
	for _, m := range utf8map {
		b := []byte(m.str)
		if !FullRune(b) {
			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
		}
		s := m.str
		if !FullRune(s) {
			t.Errorf("FullRune(%q) (%U) = false, want true", s, m.r)
		}
		b1 := b[0 : len(b)-1]
		if FullRune(b1) {
			t.Errorf("FullRune(%q) = true, want false", b1)
		}
		s1 := string(b1)
		if FullRune(s1) {
			t.Errorf("FullRune(%q) = true, want false", s1)
		}
	}
	// Single invalid leading bytes are expected to report as full runes.
	for _, s := range []string{"\xc0", "\xc1"} {
		b := []byte(s)
		if !FullRune(b) {
			t.Errorf("FullRune(%q) = false, want true", s)
		}
		if !FullRune(s) {
			t.Errorf("FullRune(%q) = false, want true", s)
		}
	}
}

// TestEncodeRune checks that EncodeRune produces the expected bytes for
// every entry of utf8map.
func TestEncodeRune(t *testing.T) {
	for _, m := range utf8map {
		b := []byte(m.str)
		var buf [10]byte
		n := EncodeRune(buf[0:], m.r)
		b1 := buf[0:n]
		if !bytes.Equal(b, b1) {
			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
		}
	}
}

// TestAppendRune checks AppendRune against utf8map, appending both to a nil
// slice and to a slice that already holds data.
func TestAppendRune(t *testing.T) {
	for _, m := range utf8map {
		if buf := AppendRune(nil, m.r); string(buf) != m.str {
			t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
		}
		if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
			t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
		}
	}
}

// TestDecodeRune checks DecodeRune on []byte and string inputs for every
// entry of utf8map: the exact encoding, the encoding followed by extra
// bytes, a truncated encoding, and a corrupted encoding.
func TestDecodeRune(t *testing.T) {
	for _, m := range utf8map {
		b := []byte(m.str)
		r, size := DecodeRune(b)
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
		}
		s := m.str
		r, size = DecodeRune(s)
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
		}

		// there's an extra byte that bytes left behind - make sure trailing byte works
		r, size = DecodeRune(b[0:cap(b)])
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
		}
		s = m.str + "\x00"
		r, size = DecodeRune(s)
		if r != m.r || size != len(b) {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
		}

		// make sure missing bytes fail
		// (dropping the only byte of a one-byte encoding leaves an empty
		// input, for which the expected width is 0 rather than 1)
		wantsize := 1
		if wantsize >= len(b) {
			wantsize = 0
		}
		r, size = DecodeRune(b[0 : len(b)-1])
		if r != RuneError || size != wantsize {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[:len(b)-1], r, size, RuneError, wantsize)
		}
		s = m.str[0 : len(m.str)-1]
		r, size = DecodeRune(s)
		if r != RuneError || size != wantsize {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
		}

		// make sure bad sequences fail
		if len(b) == 1 {
			b[0] = 0x80
		} else {
			b[len(b)-1] = 0x7F
		}
		r, size = DecodeRune(b)
		if r != RuneError || size != 1 {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
		}
		s = string(b)
		r, size = DecodeRune(s)
		if r != RuneError || size != 1 {
			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
		}
	}
}

// TestDecodeSurrogateRune checks that encoded surrogate code points decode
// to (RuneError, 1) for both []byte and string inputs.
func TestDecodeSurrogateRune(t *testing.T) {
	for _, m := range surrogateMap {
		b := []byte(m.str)
		r, size := DecodeRune(b)
		if r != RuneError || size != 1 {
			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
		}
		s := m.str
		r, size = DecodeRune(s)
		if r != RuneError || size != 1 {
			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", s, r, size, RuneError, 1)
		}
	}
}

// Check that DecodeRune and DecodeLastRune correspond to
// the equivalent range loop.
func TestSequencing(t *testing.T) {
	for _, ts := range testStrings {
		for _, m := range utf8map {
			for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
				testSequence(t, s)
			}
		}
	}
}

// runtimeRuneCount counts the runes of s via a []rune conversion.
func runtimeRuneCount(s string) int {
	return len([]rune(s)) // Replaced by gc with call to runtime.countrunes(s).
}

// Check that a range loop, len([]rune(string)) optimization and
// []rune conversions visit the same runes.
// Not really a test of this package, but the assumption is used here and
// it's good to verify.
func TestRuntimeConversion(t *testing.T) {
	for _, ts := range testStrings {
		count := RuneCount(ts)
		if n := runtimeRuneCount(ts); n != count {
			t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCount", ts, n, count)
			break
		}

		runes := []rune(ts)
		if n := len(runes); n != count {
			t.Errorf("%q: []rune() has length %d; got %d from RuneCount", ts, n, count)
			break
		}
		i := 0
		for _, r := range ts {
			if r != runes[i] {
				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
			}
			i++
		}
	}
}

// invalidSequenceTests lists byte sequences whose first rune must decode to
// RuneError. The group labels (xx, s1..s7) name the class of the leading
// byte in the decoder's first-byte table.
var invalidSequenceTests = []string{
	"\xed\xa0\x80\x80", // surrogate min
	"\xed\xbf\xbf\x80", // surrogate max

	// xx
	"\x91\x80\x80\x80",

	// s1
	"\xC2\x7F\x80\x80",
	"\xC2\xC0\x80\x80",
	"\xDF\x7F\x80\x80",
	"\xDF\xC0\x80\x80",

	// s2
	"\xE0\x9F\xBF\x80",
	"\xE0\xA0\x7F\x80",
	"\xE0\xBF\xC0\x80",
	"\xE0\xC0\x80\x80",

	// s3
	"\xE1\x7F\xBF\x80",
	"\xE1\x80\x7F\x80",
	"\xE1\xBF\xC0\x80",
	"\xE1\xC0\x80\x80",

	//s4
	"\xED\x7F\xBF\x80",
	"\xED\x80\x7F\x80",
	"\xED\x9F\xC0\x80",
	"\xED\xA0\x80\x80",

	// s5
	"\xF0\x8F\xBF\xBF",
	"\xF0\x90\x7F\xBF",
	"\xF0\x90\x80\x7F",
	"\xF0\xBF\xBF\xC0",
	"\xF0\xBF\xC0\x80",
	"\xF0\xC0\x80\x80",

	// s6
	"\xF1\x7F\xBF\xBF",
	"\xF1\x80\x7F\xBF",
	"\xF1\x80\x80\x7F",
	"\xF1\xBF\xBF\xC0",
	"\xF1\xBF\xC0\x80",
	"\xF1\xC0\x80\x80",

	// s7
	"\xF4\x7F\xBF\xBF",
	"\xF4\x80\x7F\xBF",
	"\xF4\x80\x80\x7F",
	"\xF4\x8F\xBF\xC0",
	"\xF4\x8F\xC0\x80",
	"\xF4\x90\x80\x80",
}

// runtimeDecodeRune returns the first rune produced by a range loop over s,
// or -1 if s is empty.
func runtimeDecodeRune(s string) rune {
	for _, r := range s {
		return r
	}
	return -1
}

// TestDecodeInvalidSequence checks that every entry of invalidSequenceTests
// decodes to RuneError from both []byte and string inputs, and that the
// result matches what a range loop over the string yields.
func TestDecodeInvalidSequence(t *testing.T) {
	for _, s := range invalidSequenceTests {
		r1, _ := DecodeRune([]byte(s))
		if want := RuneError; r1 != want {
			t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want)
			return
		}
		r2, _ := DecodeRune(s)
		if want := RuneError; r2 != want {
			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s, r2, want)
			return
		}
		if r1 != r2 {
			t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRune(%q) = %#04x", s, r1, s, r2)
			return
		}
		r3 := runtimeDecodeRune(s)
		if r2 != r3 {
			t.Errorf("DecodeRune(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
			return
		}
	}
}

// testSequence walks s forwards with a range loop, checking that DecodeRune
// on the []byte and string forms agrees with it at every position, then
// walks s backwards with DecodeLastRune and checks that the same runes are
// visited at the same offsets in reverse order.
func testSequence(t *testing.T, s string) {
	type info struct {
		index int
		r     rune
	}
	// index records the (offset, rune) pairs seen by the forward pass.
	index := make([]info, len(s))
	b := []byte(s)
	si := 0
	j := 0
	for i, r := range s {
		if si != i {
			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
			return
		}
		index[j] = info{i, r}
		j++
		r1, size1 := DecodeRune(b[i:])
		if r != r1 {
			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r)
			return
		}
		r2, size2 := DecodeRune(s[i:])
		if r != r2 {
			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r2, r)
			return
		}
		if size1 != size2 {
			t.Errorf("DecodeRune/DecodeRune(%q) size mismatch %d/%d", s[i:], size1, size2)
			return
		}
		si += size1
	}
	// Step back to the last recorded entry before the backward pass.
	j--
	for si = len(s); si > 0; {
		r1, size1 := DecodeLastRune(b[0:si])
		r2, size2 := DecodeLastRune(s[0:si])
		if size1 != size2 {
			t.Errorf("DecodeLastRune/DecodeLastRune(%q, %d) size mismatch %d/%d", s, si, size1, size2)
			return
		}
		if r1 != index[j].r {
			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r1, index[j].r)
			return
		}
		if r2 != index[j].r {
			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, r2, index[j].r)
			return
		}
		si -= size1
		if si != index[j].index {
			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
			return
		}
		j--
	}
	if si != 0 {
		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
	}
}

// Check that negative runes encode as U+FFFD.
func TestNegativeRune(t *testing.T) { errorbuf := make([]byte, UTFMax) errorbuf = errorbuf[0:EncodeRune(errorbuf, RuneError)] buf := make([]byte, UTFMax) buf = buf[0:EncodeRune(buf, -1)] if !bytes.Equal(buf, errorbuf) { t.Errorf("incorrect encoding [% x] for -1; expected [% x]", buf, errorbuf) } } type RuneCountTest struct { in string out int } var runecounttests = []RuneCountTest{ {"abcd", 4}, {"☺☻☹", 3}, {"1,2,3,4", 7}, {"\xe2\x00", 2}, {"\xe2\x80", 2}, {"a\xe2\x80", 3}, } func TestRuneCount(t *testing.T) { for _, tt := range runecounttests { if out := RuneCount(tt.in); out != tt.out { t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out) } if out := RuneCount([]byte(tt.in)); out != tt.out { t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out) } } } func TestRuneCountNonASCIIAllocation(t *testing.T) { if n := testing.AllocsPerRun(10, func() { s := []byte("日本語日本語日本語日") _ = RuneCount(s) }); n > 0 { t.Errorf("unexpected RuneCount allocation, got %v, want 0", n) } } type RuneLenTest struct { r rune size int } var runelentests = []RuneLenTest{ {0, 1}, {'e', 1}, {'é', 2}, {'☺', 3}, {RuneError, 3}, {MaxRune, 4}, {0xD800, -1}, {0xDFFF, -1}, {MaxRune + 1, -1}, {-1, -1}, } func TestRuneLen(t *testing.T) { for _, tt := range runelentests { if size := RuneLen(tt.r); size != tt.size { t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size) } } } type ValidTest struct { in string out bool } var validTests = []ValidTest{ {"", true}, {"a", true}, {"abc", true}, {"Ж", true}, {"ЖЖ", true}, {"брэд-ЛГТМ", true}, {"☺☻☹", true}, {"aa\xe2", false}, {string([]byte{66, 250}), false}, {string([]byte{66, 250, 67}), false}, {"a\uFFFDb", true}, {string("\xF4\x8F\xBF\xBF"), true}, // U+10FFFF {string("\xF4\x90\x80\x80"), false}, // U+10FFFF+1; out of range {string("\xF7\xBF\xBF\xBF"), false}, // 0x1FFFFF; out of range {string("\xFB\xBF\xBF\xBF\xBF"), false}, // 0x3FFFFFF; out of range {string("\xc0\x80"), false}, // U+0000 encoded in two bytes: incorrect 
{string("\xed\xa0\x80"), false}, // U+D800 high surrogate (sic) {string("\xed\xbf\xbf"), false}, // U+DFFF low surrogate (sic) } func TestValid(t *testing.T) { for _, tt := range validTests { if Valid([]byte(tt.in)) != tt.out { t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out) } if Valid(tt.in) != tt.out { t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out) } } } type ValidRuneTest struct { r rune ok bool } var validrunetests = []ValidRuneTest{ {0, true}, {'e', true}, {'é', true}, {'☺', true}, {RuneError, true}, {MaxRune, true}, {0xD7FF, true}, {0xD800, false}, {0xDFFF, false}, {0xE000, true}, {MaxRune + 1, false}, {-1, false}, } func TestValidRune(t *testing.T) { for _, tt := range validrunetests { if ok := ValidRune(tt.r); ok != tt.ok { t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok) } } } func BenchmarkRuneCountTenASCIIChars(b *testing.B) { s := []byte("0123456789") for i := 0; i < b.N; i++ { RuneCount(s) } } func BenchmarkRuneCountTenJapaneseChars(b *testing.B) { s := []byte("日本語日本語日本語日") for i := 0; i < b.N; i++ { RuneCount(s) } } func BenchmarkRuneCountStringTenASCIIChars(b *testing.B) { for i := 0; i < b.N; i++ { RuneCount("0123456789") } } func BenchmarkRuneCountStringTenJapaneseChars(b *testing.B) { for i := 0; i < b.N; i++ { RuneCount("日本語日本語日本語日") } } var ascii100000 = strings.Repeat("0123456789", 10000) func BenchmarkValidTenASCIIChars(b *testing.B) { s := []byte("0123456789") for i := 0; i < b.N; i++ { Valid(s) } } func BenchmarkValid100KASCIIChars(b *testing.B) { s := []byte(ascii100000) for i := 0; i < b.N; i++ { Valid(s) } } func BenchmarkValidTenJapaneseChars(b *testing.B) { s := []byte("日本語日本語日本語日") for i := 0; i < b.N; i++ { Valid(s) } } func BenchmarkValidLongMostlyASCII(b *testing.B) { longMostlyASCII := []byte(longStringMostlyASCII) for i := 0; i < b.N; i++ { Valid(longMostlyASCII) } } func BenchmarkValidLongJapanese(b *testing.B) { longJapanese := []byte(longStringJapanese) for i := 0; i < b.N; i++ { 
Valid(longJapanese) } } func BenchmarkValidStringTenASCIIChars(b *testing.B) { for i := 0; i < b.N; i++ { Valid("0123456789") } } func BenchmarkValidString100KASCIIChars(b *testing.B) { for i := 0; i < b.N; i++ { Valid(ascii100000) } } func BenchmarkValidStringTenJapaneseChars(b *testing.B) { for i := 0; i < b.N; i++ { Valid("日本語日本語日本語日") } } func BenchmarkValidStringLongMostlyASCII(b *testing.B) { for i := 0; i < b.N; i++ { Valid(longStringMostlyASCII) } } func BenchmarkValidStringLongJapanese(b *testing.B) { for i := 0; i < b.N; i++ { Valid(longStringJapanese) } } var longStringMostlyASCII string // ~100KB, ~97% ASCII var longStringJapanese string // ~100KB, non-ASCII func init() { const japanese = "日本語日本語日本語日" var b strings.Builder for i := 0; b.Len() < 100_000; i++ { if i%100 == 0 { b.WriteString(japanese) } else { b.WriteString("0123456789") } } longStringMostlyASCII = b.String() longStringJapanese = strings.Repeat(japanese, 100_000/len(japanese)) } func BenchmarkEncodeASCIIRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { EncodeRune(buf, 'a') // 1 byte } } func BenchmarkEncodeSpanishRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { EncodeRune(buf, 'Ñ') // 2 bytes } } func BenchmarkEncodeJapaneseRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { EncodeRune(buf, '本') // 3 bytes } } func BenchmarkEncodeMaxRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { EncodeRune(buf, MaxRune) // 4 bytes } } func BenchmarkEncodeInvalidRuneMaxPlusOne(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { EncodeRune(buf, MaxRune+1) // 3 bytes: RuneError } } func BenchmarkEncodeInvalidRuneSurrogate(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { EncodeRune(buf, 0xD800) // 3 bytes: RuneError } } func BenchmarkEncodeInvalidRuneNegative(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { EncodeRune(buf, -1) // 3 bytes: 
RuneError } } func BenchmarkAppendASCIIRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { AppendRune(buf[:0], 'a') // 1 byte } } func BenchmarkAppendSpanishRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { AppendRune(buf[:0], 'Ñ') // 2 bytes } } func BenchmarkAppendJapaneseRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { AppendRune(buf[:0], '本') // 3 bytes } } func BenchmarkAppendMaxRune(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { AppendRune(buf[:0], MaxRune) // 4 bytes } } func BenchmarkAppendInvalidRuneMaxPlusOne(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { AppendRune(buf[:0], MaxRune+1) // 3 bytes: RuneError } } func BenchmarkAppendInvalidRuneSurrogate(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { AppendRune(buf[:0], 0xD800) // 3 bytes: RuneError } } func BenchmarkAppendInvalidRuneNegative(b *testing.B) { buf := make([]byte, UTFMax) for i := 0; i < b.N; i++ { AppendRune(buf[:0], -1) // 3 bytes: RuneError } } func BenchmarkDecodeASCIIRune(b *testing.B) { a := []byte{'a'} for i := 0; i < b.N; i++ { DecodeRune(a) } } func BenchmarkDecodeJapaneseRune(b *testing.B) { nihon := []byte("本") for i := 0; i < b.N; i++ { DecodeRune(nihon) } } // boolSink is used to reference the return value of benchmarked // functions to avoid dead code elimination. var boolSink bool func BenchmarkFullRune(b *testing.B) { benchmarks := []struct { name string data []byte }{ {"ASCII", []byte("a")}, {"Incomplete", []byte("\xf0\x90\x80")}, {"Japanese", []byte("本")}, } for _, bm := range benchmarks { b.Run(bm.name, func(b *testing.B) { for i := 0; i < b.N; i++ { boolSink = FullRune(bm.data) } }) } }