pax_global_header00006660000000000000000000000064145225257270014525gustar00rootroot0000000000000052 comment=1cfacc81a878d4a07b13f51f2368cd86893d23fa golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/000077500000000000000000000000001452252572700216305ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/.github/000077500000000000000000000000001452252572700231705ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/.github/workflows/000077500000000000000000000000001452252572700252255ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/.github/workflows/benchmark.yml000066400000000000000000000005531452252572700277050ustar00rootroot00000000000000name: Go on: push: branches: - 'main' jobs: benchmark: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Setup Go uses: actions/setup-go@v2 with: go-version: 1.18.x - name: Run benchmarks run: | go install golang.org/x/perf/cmd/benchstat@latest make benchcmp count=1 golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/.github/workflows/go.yml000066400000000000000000000017011452252572700263540ustar00rootroot00000000000000name: Go on: [pull_request] jobs: test: strategy: matrix: go: - 1.17.x - 1.18.x os: - [self-hosted, linux, arm64, segment] - ubuntu-latest runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - name: Setup Go ${{ matrix.go }} uses: actions/setup-go@v2 with: go-version: ${{ matrix.go }} - name: Download Dependencies run: go mod download - name: Vet run: go vet ./... - name: Test run: go test -race -v ./... - name: Test (purego) run: go test -race -tags purego -v ./... gen: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Setup Go uses: actions/setup-go@v2 with: go-version: 1.18.x - name: Ensure generated assembler code is up to date run: | go install github.com/kevinburke/differ@1.2 differ make --always-make build golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/.gitignore000066400000000000000000000004311452252572700236160ustar00rootroot00000000000000# Binaries for programs and plugins *.exe *.exe~ *.dll *.so *.dylib # Test binary, built with `go test -c` *.test # Output of the go coverage tool, specifically when used with LiteIDE *.out # Dependency directories (remove the comment below to include it) # vendor/ # Emacs *~ golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/CODE_OF_CONDUCT.md000066400000000000000000000062631452252572700244360ustar00rootroot00000000000000# Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
## Our Standards Examples of behavior that contributes to creating a positive environment include: - Using welcoming and inclusive language - Being respectful of differing viewpoints and experiences - Gracefully accepting constructive criticism - Focusing on what is best for the community - Showing empathy towards other community members Examples of unacceptable behavior by participants include: - The use of sexualized language or imagery and unwelcome sexual attention or advances - Trolling, insulting/derogatory comments, and personal or political attacks - Public or private harassment - Publishing others' private information, such as a physical or electronic address, without explicit permission - Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at open-source@twilio.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/CONTRIBUTING.md000066400000000000000000000015741452252572700240700ustar00rootroot00000000000000# Contributing to segmentio/asm ## Code of Conduct Help us keep the project open and inclusive. Please be kind to and considerate of other developers, as we all have the same goal: make the project as good as it can be. * [Code of Conduct](./CODE_OF_CONDUCT.md) ## Licensing All third party contributors acknowledge that any contributions they provide will be made under the same open source license that the open source project is provided under. ## Contributing * Open an Issue to report bugs or discuss non-trivial changes. * Open a Pull Request to submit a code change for review. 
### Coding Rules To ensure consistency throughout the source code, keep these rules in mind when submitting contributions: * All features or bug fixes must be tested by one or more tests. * All exported types, functions, and symbols must be documented. * All code must be formatted with `go fmt`. golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/LICENSE000066400000000000000000000016061452252572700226400ustar00rootroot00000000000000MIT No Attribution Copyright 2023 Segment Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/Makefile000066400000000000000000000021231452252572700232660ustar00rootroot00000000000000SHELL = /bin/bash dstdir := $(CURDIR) srcdir := $(CURDIR)/build sources := $(wildcard \ $(srcdir)/*_asm.go \ $(srcdir)/*/*_asm.go \ ) targets := \ $(patsubst $(srcdir)/%_asm.go,$(dstdir)/%_amd64.s,$(sources)) \ $(patsubst $(srcdir)/%_asm.go,$(dstdir)/%_amd64.go,$(sources)) internal := $(wildcard $(srcdir)/internal/*/*.go) build: $(targets) count ?= 5 bench ?= . pkg ?= ... benchcmp: go test -v -run _ -count $(count) -bench $(bench) ./$(pkg) -tags purego > /tmp/bench-$(subst .,dot,$(pkg))-purego.txt go test -v -run _ -count $(count) -bench $(bench) ./$(pkg) > /tmp/bench-$(subst .,dot,$(pkg))-asm.txt benchstat /tmp/bench-$(subst .,dot,$(pkg))-{purego,asm}.txt $(dstdir)/%_amd64.s $(dstdir)/%_amd64.go: $(srcdir)/%_asm.go $(internal) cd build && go run $(patsubst $(CURDIR)/build/%,%,$<) \ -pkg $(notdir $(realpath $(dir $<))) \ -out ../$(patsubst $(CURDIR)/%,%,$(patsubst $(srcdir)/%_asm.go,$(dstdir)/%_amd64.s,$<)) \ -stubs ../$(patsubst $(CURDIR)/%,%,$(patsubst $(srcdir)/%_asm.go,$(dstdir)/%_amd64.go,$<)) go fmt $(dstdir)/$(*)_amd64.go .PHONY: build benchcmp golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/README.md000066400000000000000000000143111452252572700231070ustar00rootroot00000000000000# asm ![build status](https://github.com/segmentio/asm/actions/workflows/go.yml/badge.svg) [![GoDoc](https://godoc.org/github.com/segmentio/asm?status.svg)](https://godoc.org/github.com/segmentio/asm) Go library providing algorithms that use the full power of modern CPUs to get the best performance. ## Motivation The cloud makes it easier than ever to access large scale compute capacity, and it's become common to run distributed systems deployed across dozens or sometimes hundreds of CPUs. Because projects run on so many cores now, program performance and efficiency matter more today than ever before. Modern CPUs are complex machines with performance characteristics that may vary by orders of magnitude depending on how they are used. 
Features like branch prediction, instruction reordering, pipelining, or caching are all input variables that determine the compute throughput that a CPU can achieve. While compilers keep improving, and often apply micro-optimizations that would be counter-productive for human developers to maintain by hand, there are limits to what they can do, and Assembly still has a role to play in optimizing algorithms on the hot code paths of large scale applications. SIMD instruction sets offer interesting opportunities for software engineers. Taking advantage of these instructions often requires rethinking how the program represents and manipulates data, which is beyond the realm of optimizations that a compiler can implement. When renting CPU time from a Cloud provider, programs that fail to leverage the full set of instructions available are therefore paying for features they do not use. This package aims to provide such algorithms, optimized to leverage advanced instruction sets of modern CPUs to maximize throughput and take the best advantage of the available compute power. Users of the package will find functions that have often been designed to work on **arrays of values**, which is where SIMD and branchless algorithms shine. The functions in this library have been used in high throughput production environments at Segment; we hope that they will be useful to other developers using Go in performance-sensitive software. ## Usage The library is composed of multiple Go packages intended to act as logical groups of functions sharing similar properties: | Package | Purpose | | ------- | ------- | | [ascii](ascii) | library of functions designed to work on ASCII inputs | | [base64](base64) | standard library compatible base64 encodings | | [bswap](bswap) | byte swapping algorithms working on arrays of fixed-size items | | [cpu](cpu) | definition of the ABI used to detect CPU features | | [mem](mem) | functions operating on byte arrays | | [qsort](qsort) | quick-sort implementations for arrays of fixed-size items | | [slices](slices) | functions performing computations on pairs of slices | | [sortedset](sortedset) | functions working on sorted arrays of fixed-size items | When no assembly version of a function is available for the target platform, the package provides a generic implementation in Go which is automatically picked up by the compiler. 
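For example, the `ascii` and `base64` packages mirror their standard library counterparts closely enough that adopting them is usually just an import change. A minimal sketch (the results shown in the comments are illustrative):

```go
package main

import (
	"fmt"

	"github.com/segmentio/asm/ascii"
	"github.com/segmentio/asm/base64"
)

func main() {
	// Case-insensitive comparison specialized for ASCII input.
	fmt.Println(ascii.EqualFoldString("Hello World!", "HELLO world!")) // true

	// Drop-in replacement for encoding/base64.
	enc := base64.StdEncoding.EncodeToString([]byte("hello"))
	dec, err := base64.StdEncoding.DecodeString(enc)
	fmt.Println(enc, string(dec), err) // aGVsbG8= hello <nil>
}
```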
## Showcase Since the purpose of this library is to improve the runtime efficiency of Go programs, we compiled a few snapshots of benchmark runs to showcase the kind of improvements that these code paths can expect from leveraging SIMD and branchless optimizations: ``` goos: darwin goarch: amd64 cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz ``` ``` pkg: github.com/segmentio/asm/ascii name old time/op new time/op delta EqualFoldString/0512 276ns ± 1% 21ns ± 2% -92.50% (p=0.008 n=5+5) name old speed new speed delta EqualFoldString/0512 3.71GB/s ± 1% 49.44GB/s ± 2% +1232.79% (p=0.008 n=5+5) ``` ``` pkg: github.com/segmentio/asm/bswap name old time/op new time/op delta Swap64 11.2µs ± 1% 0.9µs ± 9% -92.06% (p=0.008 n=5+5) name old speed new speed delta Swap64 5.83GB/s ± 1% 73.67GB/s ± 9% +1162.98% (p=0.008 n=5+5) ``` ``` pkg: github.com/segmentio/asm/qsort name old time/op new time/op delta Sort16/1000000 269ms ± 2% 46ms ± 3% -83.08% (p=0.008 n=5+5) name old speed new speed delta Sort16/1000000 59.4MB/s ± 2% 351.2MB/s ± 3% +491.24% (p=0.008 n=5+5) ``` ## Maintenance The assembly code is generated with [AVO](https://github.com/mmcloughlin/avo), and orchestrated by a Makefile which helps maintainers rebuild the assembly source code when the AVO files are modified. The repository contains two Go modules; the main module is declared as `github.com/segmentio/asm` at the root of the repository, and the second module is found in the `build` subdirectory. The `build` module is used to isolate build dependencies from programs that import the main module. Through this mechanism, AVO does not become a dependency of programs using `github.com/segmentio/asm`, keeping the dependency management overhead minimal for the users, and allowing maintainers to make modifications to the `build` package. Versioning of the two modules is managed independently; while we aim to provide stable APIs on the main package, breaking changes may be introduced on the `build` package more often, as it is intended to be a proving ground for more experimental constructs in the project. ### Requirements Some libraries have custom-purpose code for both amd64 and arm64. Others (qsort) have only amd64. Search for a `.s` file matching your architecture to be sure you are using the assembly-optimized version of a library. The Go code requires Go 1.17 or above. These versions contain significant performance improvements compared to previous Go versions. `asm` version v1.1.5 and earlier maintain compatibility with Go 1.16. ### purego Programs in the `build` module should add the following declaration: ```go func init() { ConstraintExpr("!purego") } ``` It instructs AVO to inject the `!purego` tag in the generated files, allowing the libraries to be compiled without any assembly optimizations with a build command such as: ``` go build -tags purego ... ``` This is mainly useful to compare the impact of using the assembly optimized versions instead of the simpler Go-only implementations. 
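The `benchcmp` target in the Makefile automates this comparison: it runs the benchmarks with and without the `purego` tag and diffs the two results with `benchstat`. A typical invocation, using the `pkg` and `count` variables the Makefile defines, looks like:

```
make benchcmp pkg=ascii count=5
```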
golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/000077500000000000000000000000001452252572700227205ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/ascii.go000066400000000000000000000046471452252572700243520ustar00rootroot00000000000000package ascii import _ "github.com/segmentio/asm/cpu" // https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord const ( hasLessConstL64 = (^uint64(0)) / 255 hasLessConstR64 = hasLessConstL64 * 128 hasLessConstL32 = (^uint32(0)) / 255 hasLessConstR32 = hasLessConstL32 * 128 hasMoreConstL64 = (^uint64(0)) / 255 hasMoreConstR64 = hasMoreConstL64 * 128 hasMoreConstL32 = (^uint32(0)) / 255 hasMoreConstR32 = hasMoreConstL32 * 128 ) func hasLess64(x, n uint64) bool { return ((x - (hasLessConstL64 * n)) & ^x & hasLessConstR64) != 0 } func hasLess32(x, n uint32) bool { return ((x - (hasLessConstL32 * n)) & ^x & hasLessConstR32) != 0 } func hasMore64(x, n uint64) bool { return (((x + (hasMoreConstL64 * (127 - n))) | x) & hasMoreConstR64) != 0 } func hasMore32(x, n uint32) bool { return (((x + (hasMoreConstL32 * (127 - n))) | x) & hasMoreConstR32) != 0 } var lowerCase = [256]byte{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/ascii_test.go000066400000000000000000000162731452252572700254070ustar00rootroot00000000000000package ascii import ( "fmt" "math/rand" "strings" "testing" "unicode/utf8" "github.com/segmentio/asm/internal/buffer" ) var testStrings = [...]string{ "", "a", "ab", "abc", "abcd", "hello", "Hello World!", "Hello\"World!", "Hello\\World!", "Hello\nWorld!", "Hello\rWorld!", "Hello\tWorld!", "Hello\bWorld!", "Hello\fWorld!", "H~llo World!", "H~llo", "你好", "~", "\x80", "\x7F", "\xFF", "a string of 16B.", "an invalid string of 32B. 
\x00......", "some kind of long string with only ascii characters.", "some kind of long string with a non-ascii character at the end.\xff", strings.Repeat("1234567890", 1000), } var testStringsUTF8 []string func init() { for _, test := range testStrings { if utf8.ValidString(test) { testStringsUTF8 = append(testStringsUTF8, test) } } } func testString(s string, f func(byte) bool) bool { for i := range s { if !f(s[i]) { return false } } return true } func testValid(s string) bool { return testString(s, ValidByte) } func testValidPrint(s string) bool { return testString(s, ValidPrintByte) } func TestValid(t *testing.T) { buf := newBuffer(t, 1024) for _, input := range [2][]byte{buf.ProtectHead(), buf.ProtectTail()} { for i := 0; i < len(input); i++ { in := input[:i+1] for b := 0; b <= 0xFF; b++ { in[i] = byte(b) if b < 0x80 { if !Valid(in) { t.Errorf("should be valid: %v", in) } } else { if Valid(in) { t.Errorf("should not be valid: %v", in) } } in[i] = 'x' } } } } func TestValidPrint(t *testing.T) { buf := newBuffer(t, 1024) for _, input := range [2][]byte{buf.ProtectHead(), buf.ProtectTail()} { for i := 0; i < len(input); i++ { in := input[:i+1] for b := 0; b <= 0xFF; b++ { in[i] = byte(b) if ' ' <= b && b <= '~' { if !ValidPrint(in) { t.Errorf("should be valid: %v", in) } } else { if ValidPrint(in) { t.Errorf("should not be valid: %v", in) } } in[i] = 'x' } } } } func TestValidString(t *testing.T) { testValidationFunction(t, testValid, ValidString) } func TestValidPrintString(t *testing.T) { testValidationFunction(t, testValidPrint, ValidPrintString) } func testValidationFunction(t *testing.T, reference, function func(string) bool) { for _, test := range testStrings { t.Run(limit(test), func(t *testing.T) { expect := reference(test) if valid := function(test); expect != valid { t.Errorf("expected %t but got %t", expect, valid) } }) } } func BenchmarkValid(b *testing.B) { benchmarkValidationFunction(b, ValidString) } func BenchmarkValidPrint(b *testing.B) { benchmarkValidationFunction(b, ValidPrintString) } func benchmarkValidationFunction(b *testing.B, function func(string) bool) { for _, test := range testStrings { b.Run(limit(test), func(b *testing.B) { for i := 0; i < b.N; i++ { _ = function(test) } b.SetBytes(int64(len(test))) }) } } func TestHasPrefixFoldString(t *testing.T) { for _, test := range testStringsUTF8 { t.Run(limit(test), func(t *testing.T) { prefix := test if len(prefix) > 0 { prefix = prefix[:len(prefix)/2] } upper := strings.ToUpper(prefix) lower := strings.ToLower(prefix) if !HasPrefixFoldString(test, prefix) { t.Errorf("%q does not match %q", test, prefix) } if !HasPrefixFoldString(test, upper) { t.Errorf("%q does not match %q", test, upper) } if !HasPrefixFoldString(test, lower) { t.Errorf("%q does not match %q", test, lower) } }) } } func TestHasSuffixFoldString(t *testing.T) { for _, test := range testStringsUTF8 { t.Run(limit(test), func(t *testing.T) { suffix := test if len(suffix) > 0 { suffix = suffix[len(suffix)/2:] } upper := strings.ToUpper(suffix) lower := strings.ToLower(suffix) if !HasSuffixFoldString(test, suffix) { t.Errorf("%q does not match %q", test, suffix) } if !HasSuffixFoldString(test, upper) { t.Errorf("%q does not match %q", test, upper) } if !HasSuffixFoldString(test, lower) { t.Errorf("%q does not match %q", test, lower) } }) } } func TestEqualFoldString(t *testing.T) { // Only test valid UTF-8 otherwise ToUpper/ToLower will convert invalid // characters to UTF-8 placeholders, which breaks the case-insensitive // equality. 
for _, test := range testStringsUTF8 { t.Run(limit(test), func(t *testing.T) { upper := strings.ToUpper(test) lower := strings.ToLower(test) if !EqualFoldString(test, test) { t.Errorf("%q does not match %q", test, test) } if !EqualFoldString(test, upper) { t.Errorf("%q does not match %q", test, upper) } if !EqualFoldString(test, lower) { t.Errorf("%q does not match %q", test, lower) } if len(test) > 1 { reverse := make([]byte, len(test)) for i := range reverse { reverse[i] = test[len(test)-(i+1)] } if EqualFoldString(test, string(reverse)) { t.Errorf("%q matches %q", test, reverse) } } }) } } func newBuffer(t *testing.T, n int) buffer.Buffer { buf, err := buffer.New(n) if err != nil { t.Fatal(err) } return buf } func TestEqualFold(t *testing.T) { ubuf := newBuffer(t, 1024) defer ubuf.Release() lbuf := newBuffer(t, 1024) defer lbuf.Release() mbuf := newBuffer(t, 1024) defer mbuf.Release() upper := ubuf.ProtectHead() lower := lbuf.ProtectTail() mixed := mbuf.ProtectHead() for i := 0; i < len(upper); i++ { u := upper[:i+1] l := lower[:i+1] m := mixed[:i+1] u[i] = 'Z' l[i] = 'a' if EqualFold(l, u) { t.Errorf("%q matches %q", l, u) } u[i] = byte(i % 128) l[i] = byte(i % 128) if 'A' <= l[i] && l[i] <= 'Z' { l[i] += 32 } if 'a' <= u[i] && u[i] <= 'z' { u[i] -= 32 } if rand.Int()%2 == 0 { m[i] = l[i] } else { m[i] = u[i] } if !EqualFold(l, u) { t.Errorf("%q does not match %q", l, u) } if !EqualFold(l, m) { t.Errorf("%q does not match %q", l, m) } if !EqualFold(u, m) { t.Errorf("%q does not match %q", u, m) } } } func genValidString(n int, ch byte) (s string) { for i := 0; i < n; i++ { s += string(byte(i%26) + ch) } return } func genEqualStrings(n int) (l string, u string) { return genValidString(n, 'A'), genValidString(n, 'a') } func BenchmarkEqualFoldString(b *testing.B) { sizes := [...]int{7, 8, 9, 15, 16, 17, 31, 32, 33, 512, 2000} for _, s := range sizes { lower, upper := genEqualStrings(s) b.Run(fmt.Sprintf("%04d", s), func(b *testing.B) { for i := 0; i < b.N; i++ { EqualFoldString(lower, upper) } b.SetBytes(int64(len(lower) + len(upper))) }) } } func BenchmarkValidString(b *testing.B) { sizes := [...]int{7, 8, 9, 15, 16, 17, 31, 32, 33, 512, 2000} for _, s := range sizes { str := genValidString(s, 'a') b.Run(fmt.Sprintf("%04d", s), func(b *testing.B) { for i := 0; i < b.N; i++ { ValidString(str) } b.SetBytes(int64(s)) }) } } func BenchmarkValidPrintString(b *testing.B) { sizes := [...]int{7, 8, 9, 15, 16, 17, 31, 32, 33, 512, 2000} for _, s := range sizes { str := genValidString(s, 'a') b.Run(fmt.Sprintf("%04d", s), func(b *testing.B) { for i := 0; i < b.N; i++ { ValidPrintString(str) } b.SetBytes(int64(s)) }) } } func limit(s string) string { if len(s) > 17 { return s[:17] + "..." } return s } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/equal_fold.go000066400000000000000000000016161452252572700253660ustar00rootroot00000000000000package ascii import ( "github.com/segmentio/asm/internal/unsafebytes" ) // EqualFold is a version of bytes.EqualFold designed to work on ASCII input // instead of UTF-8. // // When the program has guarantees that the input is composed of ASCII // characters only, it allows for greater optimizations. 
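//
// For example (illustrative):
//
//	ascii.EqualFold([]byte("Hello"), []byte("hELLO")) // true
//	ascii.EqualFold([]byte("Hello"), []byte("World")) // false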
func EqualFold(a, b []byte) bool { return EqualFoldString(unsafebytes.String(a), unsafebytes.String(b)) } func HasPrefixFold(s, prefix []byte) bool { return len(s) >= len(prefix) && EqualFold(s[:len(prefix)], prefix) } func HasSuffixFold(s, suffix []byte) bool { return len(s) >= len(suffix) && EqualFold(s[len(s)-len(suffix):], suffix) } func HasPrefixFoldString(s, prefix string) bool { return len(s) >= len(prefix) && EqualFoldString(s[:len(prefix)], prefix) } func HasSuffixFoldString(s, suffix string) bool { return len(s) >= len(suffix) && EqualFoldString(s[len(s)-len(suffix):], suffix) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/equal_fold_amd64.go000066400000000000000000000007241452252572700263600ustar00rootroot00000000000000// Code generated by command: go run equal_fold_asm.go -pkg ascii -out ../ascii/equal_fold_amd64.s -stubs ../ascii/equal_fold_amd64.go. DO NOT EDIT. //go:build !purego package ascii // EqualFoldString is a version of strings.EqualFold designed to work on ASCII // input instead of UTF-8. // // When the program has guarantees that the input is composed of ASCII // characters only, it allows for greater optimizations. func EqualFoldString(a string, b string) bool golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/equal_fold_amd64.s000066400000000000000000000141711452252572700262160ustar00rootroot00000000000000// Code generated by command: go run equal_fold_asm.go -pkg ascii -out ../ascii/equal_fold_amd64.s -stubs ../ascii/equal_fold_amd64.go. DO NOT EDIT. //go:build !purego #include "textflag.h" // func EqualFoldString(a string, b string) bool // Requires: AVX, AVX2, SSE4.1 TEXT ·EqualFoldString(SB), NOSPLIT, $0-33 MOVQ a_base+0(FP), CX MOVQ a_len+8(FP), DX MOVQ b_base+16(FP), BX CMPQ DX, b_len+24(FP) JNE done XORQ AX, AX CMPQ DX, $0x10 JB init_x86 BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS init_avx init_x86: LEAQ github·com∕segmentio∕asm∕ascii·lowerCase+0(SB), R9 XORL SI, SI cmp8: CMPQ DX, $0x08 JB cmp7 MOVBLZX (CX)(AX*1), DI MOVBLZX (BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI MOVBLZX 1(CX)(AX*1), DI MOVBLZX 1(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI MOVBLZX 2(CX)(AX*1), DI MOVBLZX 2(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI MOVBLZX 3(CX)(AX*1), DI MOVBLZX 3(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI MOVBLZX 4(CX)(AX*1), DI MOVBLZX 4(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI MOVBLZX 5(CX)(AX*1), DI MOVBLZX 5(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI MOVBLZX 6(CX)(AX*1), DI MOVBLZX 6(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI MOVBLZX 7(CX)(AX*1), DI MOVBLZX 7(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI JNE done ADDQ $0x08, AX SUBQ $0x08, DX JMP cmp8 cmp7: CMPQ DX, $0x07 JB cmp6 MOVBLZX 6(CX)(AX*1), DI MOVBLZX 6(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI cmp6: CMPQ DX, $0x06 JB cmp5 MOVBLZX 5(CX)(AX*1), DI MOVBLZX 5(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI cmp5: CMPQ DX, $0x05 JB cmp4 MOVBLZX 4(CX)(AX*1), DI MOVBLZX 4(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI cmp4: CMPQ DX, $0x04 JB cmp3 MOVBLZX 3(CX)(AX*1), DI MOVBLZX 3(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI cmp3: CMPQ DX, $0x03 JB cmp2 MOVBLZX 2(CX)(AX*1), DI MOVBLZX 2(BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI cmp2: CMPQ DX, $0x02 JB cmp1 MOVBLZX 1(CX)(AX*1), DI MOVBLZX 1(BX)(AX*1), R8 
MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI cmp1: CMPQ DX, $0x01 JB success MOVBLZX (CX)(AX*1), DI MOVBLZX (BX)(AX*1), R8 MOVB (R9)(DI*1), DI XORB (R9)(R8*1), DI ORB DI, SI done: SETEQ ret+32(FP) RET success: MOVB $0x01, ret+32(FP) RET init_avx: MOVB $0x20, SI PINSRB $0x00, SI, X12 VPBROADCASTB X12, Y12 MOVB $0x1f, SI PINSRB $0x00, SI, X13 VPBROADCASTB X13, Y13 MOVB $0x9a, SI PINSRB $0x00, SI, X14 VPBROADCASTB X14, Y14 MOVB $0x01, SI PINSRB $0x00, SI, X15 VPBROADCASTB X15, Y15 cmp128: CMPQ DX, $0x80 JB cmp64 VMOVDQU (CX)(AX*1), Y0 VMOVDQU 32(CX)(AX*1), Y1 VMOVDQU 64(CX)(AX*1), Y2 VMOVDQU 96(CX)(AX*1), Y3 VMOVDQU (BX)(AX*1), Y4 VMOVDQU 32(BX)(AX*1), Y5 VMOVDQU 64(BX)(AX*1), Y6 VMOVDQU 96(BX)(AX*1), Y7 VXORPD Y0, Y4, Y4 VPCMPEQB Y12, Y4, Y8 VORPD Y12, Y0, Y0 VPADDB Y13, Y0, Y0 VPCMPGTB Y0, Y14, Y0 VPAND Y8, Y0, Y0 VPAND Y15, Y0, Y0 VPSLLW $0x05, Y0, Y0 VPCMPEQB Y4, Y0, Y0 VXORPD Y1, Y5, Y5 VPCMPEQB Y12, Y5, Y9 VORPD Y12, Y1, Y1 VPADDB Y13, Y1, Y1 VPCMPGTB Y1, Y14, Y1 VPAND Y9, Y1, Y1 VPAND Y15, Y1, Y1 VPSLLW $0x05, Y1, Y1 VPCMPEQB Y5, Y1, Y1 VXORPD Y2, Y6, Y6 VPCMPEQB Y12, Y6, Y10 VORPD Y12, Y2, Y2 VPADDB Y13, Y2, Y2 VPCMPGTB Y2, Y14, Y2 VPAND Y10, Y2, Y2 VPAND Y15, Y2, Y2 VPSLLW $0x05, Y2, Y2 VPCMPEQB Y6, Y2, Y2 VXORPD Y3, Y7, Y7 VPCMPEQB Y12, Y7, Y11 VORPD Y12, Y3, Y3 VPADDB Y13, Y3, Y3 VPCMPGTB Y3, Y14, Y3 VPAND Y11, Y3, Y3 VPAND Y15, Y3, Y3 VPSLLW $0x05, Y3, Y3 VPCMPEQB Y7, Y3, Y3 VPAND Y1, Y0, Y0 VPAND Y3, Y2, Y2 VPAND Y2, Y0, Y0 ADDQ $0x80, AX SUBQ $0x80, DX VPMOVMSKB Y0, SI XORL $0xffffffff, SI JNE done JMP cmp128 cmp64: CMPQ DX, $0x40 JB cmp32 VMOVDQU (CX)(AX*1), Y0 VMOVDQU 32(CX)(AX*1), Y1 VMOVDQU (BX)(AX*1), Y2 VMOVDQU 32(BX)(AX*1), Y3 VXORPD Y0, Y2, Y2 VPCMPEQB Y12, Y2, Y4 VORPD Y12, Y0, Y0 VPADDB Y13, Y0, Y0 VPCMPGTB Y0, Y14, Y0 VPAND Y4, Y0, Y0 VPAND Y15, Y0, Y0 VPSLLW $0x05, Y0, Y0 VPCMPEQB Y2, Y0, Y0 VXORPD Y1, Y3, Y3 VPCMPEQB Y12, Y3, Y5 VORPD Y12, Y1, Y1 VPADDB Y13, Y1, Y1 VPCMPGTB Y1, Y14, Y1 VPAND Y5, Y1, Y1 VPAND Y15, Y1, Y1 VPSLLW $0x05, Y1, Y1 VPCMPEQB Y3, Y1, Y1 VPAND Y1, Y0, Y0 ADDQ $0x40, AX SUBQ $0x40, DX VPMOVMSKB Y0, SI XORL $0xffffffff, SI JNE done cmp32: CMPQ DX, $0x20 JB cmp16 VMOVDQU (CX)(AX*1), Y0 VMOVDQU (BX)(AX*1), Y1 VXORPD Y0, Y1, Y1 VPCMPEQB Y12, Y1, Y2 VORPD Y12, Y0, Y0 VPADDB Y13, Y0, Y0 VPCMPGTB Y0, Y14, Y0 VPAND Y2, Y0, Y0 VPAND Y15, Y0, Y0 VPSLLW $0x05, Y0, Y0 VPCMPEQB Y1, Y0, Y0 ADDQ $0x20, AX SUBQ $0x20, DX VPMOVMSKB Y0, SI XORL $0xffffffff, SI JNE done cmp16: CMPQ DX, $0x10 JLE cmp_tail VMOVDQU (CX)(AX*1), X0 VMOVDQU (BX)(AX*1), X1 VXORPD X0, X1, X1 VPCMPEQB X12, X1, X2 VORPD X12, X0, X0 VPADDB X13, X0, X0 VPCMPGTB X0, X14, X0 VPAND X2, X0, X0 VPAND X15, X0, X0 VPSLLW $0x05, X0, X0 VPCMPEQB X1, X0, X0 ADDQ $0x10, AX SUBQ $0x10, DX VPMOVMSKB X0, SI XORL $0x0000ffff, SI JNE done cmp_tail: SUBQ $0x10, DX ADDQ DX, AX VMOVDQU (CX)(AX*1), X0 VMOVDQU (BX)(AX*1), X1 VXORPD X0, X1, X1 VPCMPEQB X12, X1, X2 VORPD X12, X0, X0 VPADDB X13, X0, X0 VPCMPGTB X0, X14, X0 VPAND X2, X0, X0 VPAND X15, X0, X0 VPSLLW $0x05, X0, X0 VPCMPEQB X1, X0, X0 VPMOVMSKB X0, AX XORL $0x0000ffff, AX JMP done golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/equal_fold_default.go000066400000000000000000000024501452252572700270670ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package ascii // EqualFoldString is a version of strings.EqualFold designed to work on ASCII // input instead of UTF-8. // // When the program has guarantees that the input is composed of ASCII // characters only, it allows for greater optimizations. 
func EqualFoldString(a, b string) bool { if len(a) != len(b) { return false } var cmp byte for len(a) >= 8 { cmp |= lowerCase[a[0]] ^ lowerCase[b[0]] cmp |= lowerCase[a[1]] ^ lowerCase[b[1]] cmp |= lowerCase[a[2]] ^ lowerCase[b[2]] cmp |= lowerCase[a[3]] ^ lowerCase[b[3]] cmp |= lowerCase[a[4]] ^ lowerCase[b[4]] cmp |= lowerCase[a[5]] ^ lowerCase[b[5]] cmp |= lowerCase[a[6]] ^ lowerCase[b[6]] cmp |= lowerCase[a[7]] ^ lowerCase[b[7]] if cmp != 0 { return false } a = a[8:] b = b[8:] } switch len(a) { case 7: cmp |= lowerCase[a[6]] ^ lowerCase[b[6]] fallthrough case 6: cmp |= lowerCase[a[5]] ^ lowerCase[b[5]] fallthrough case 5: cmp |= lowerCase[a[4]] ^ lowerCase[b[4]] fallthrough case 4: cmp |= lowerCase[a[3]] ^ lowerCase[b[3]] fallthrough case 3: cmp |= lowerCase[a[2]] ^ lowerCase[b[2]] fallthrough case 2: cmp |= lowerCase[a[1]] ^ lowerCase[b[1]] fallthrough case 1: cmp |= lowerCase[a[0]] ^ lowerCase[b[0]] } return cmp == 0 } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid.go000066400000000000000000000006371452252572700243540ustar00rootroot00000000000000package ascii import "github.com/segmentio/asm/internal/unsafebytes" // Valid returns true if b contains only ASCII characters. func Valid(b []byte) bool { return ValidString(unsafebytes.String(b)) } // ValidByte returns true if b is an ASCII character. func ValidByte(b byte) bool { return b <= 0x7f } // ValidRune returns true if r is an ASCII character. func ValidRune(r rune) bool { return r <= 0x7f } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid_amd64.go000066400000000000000000000004131452252572700253400ustar00rootroot00000000000000// Code generated by command: go run valid_asm.go -pkg ascii -out ../ascii/valid_amd64.s -stubs ../ascii/valid_amd64.go. DO NOT EDIT. //go:build !purego package ascii // ValidString returns true if s contains only ASCII characters. func ValidString(s string) bool golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid_amd64.s000066400000000000000000000041401452252572700251750ustar00rootroot00000000000000// Code generated by command: go run valid_asm.go -pkg ascii -out ../ascii/valid_amd64.s -stubs ../ascii/valid_amd64.go. DO NOT EDIT. 
//go:build !purego #include "textflag.h" // func ValidString(s string) bool // Requires: AVX, AVX2, SSE4.1 TEXT ·ValidString(SB), NOSPLIT, $0-17 MOVQ s_base+0(FP), AX MOVQ s_len+8(FP), CX MOVQ $0x8080808080808080, DX CMPQ CX, $0x10 JB cmp8 BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS init_avx cmp8: CMPQ CX, $0x08 JB cmp4 TESTQ DX, (AX) JNZ invalid ADDQ $0x08, AX SUBQ $0x08, CX JMP cmp8 cmp4: CMPQ CX, $0x04 JB cmp3 TESTL $0x80808080, (AX) JNZ invalid ADDQ $0x04, AX SUBQ $0x04, CX cmp3: CMPQ CX, $0x03 JB cmp2 MOVWLZX (AX), CX MOVBLZX 2(AX), AX SHLL $0x10, AX ORL CX, AX TESTL $0x80808080, AX JMP done cmp2: CMPQ CX, $0x02 JB cmp1 TESTW $0x8080, (AX) JMP done cmp1: CMPQ CX, $0x00 JE done TESTB $0x80, (AX) done: SETEQ ret+16(FP) RET invalid: MOVB $0x00, ret+16(FP) RET init_avx: PINSRQ $0x00, DX, X4 VPBROADCASTQ X4, Y4 cmp256: CMPQ CX, $0x00000100 JB cmp128 VMOVDQU (AX), Y0 VPOR 32(AX), Y0, Y0 VMOVDQU 64(AX), Y1 VPOR 96(AX), Y1, Y1 VMOVDQU 128(AX), Y2 VPOR 160(AX), Y2, Y2 VMOVDQU 192(AX), Y3 VPOR 224(AX), Y3, Y3 VPOR Y1, Y0, Y0 VPOR Y3, Y2, Y2 VPOR Y2, Y0, Y0 VPTEST Y0, Y4 JNZ invalid ADDQ $0x00000100, AX SUBQ $0x00000100, CX JMP cmp256 cmp128: CMPQ CX, $0x80 JB cmp64 VMOVDQU (AX), Y0 VPOR 32(AX), Y0, Y0 VMOVDQU 64(AX), Y1 VPOR 96(AX), Y1, Y1 VPOR Y1, Y0, Y0 VPTEST Y0, Y4 JNZ invalid ADDQ $0x80, AX SUBQ $0x80, CX cmp64: CMPQ CX, $0x40 JB cmp32 VMOVDQU (AX), Y0 VPOR 32(AX), Y0, Y0 VPTEST Y0, Y4 JNZ invalid ADDQ $0x40, AX SUBQ $0x40, CX cmp32: CMPQ CX, $0x20 JB cmp16 VPTEST (AX), Y4 JNZ invalid ADDQ $0x20, AX SUBQ $0x20, CX cmp16: CMPQ CX, $0x10 JLE cmp_tail VPTEST (AX), X4 JNZ invalid ADDQ $0x10, AX SUBQ $0x10, CX cmp_tail: SUBQ $0x10, CX ADDQ CX, AX VPTEST (AX), X4 JMP done golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid_default.go000066400000000000000000000015121452252572700260510ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package ascii import ( "unsafe" ) // ValidString returns true if s contains only ASCII characters. func ValidString(s string) bool { p := *(*unsafe.Pointer)(unsafe.Pointer(&s)) i := uintptr(0) n := uintptr(len(s)) for i+8 <= n { if (*(*uint64)(unsafe.Pointer(uintptr(p) + i)) & 0x8080808080808080) != 0 { return false } i += 8 } if i+4 <= n { if (*(*uint32)(unsafe.Pointer(uintptr(p) + i)) & 0x80808080) != 0 { return false } i += 4 } if i == n { return true } p = unsafe.Pointer(uintptr(p) + i) var x uint32 switch n - i { case 3: x = uint32(*(*uint16)(p)) | uint32(*(*uint8)(unsafe.Pointer(uintptr(p) + 2)))<<16 case 2: x = uint32(*(*uint16)(p)) case 1: x = uint32(*(*uint8)(p)) default: return true } return (x & 0x80808080) == 0 } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid_print.go000066400000000000000000000007461452252572700255710ustar00rootroot00000000000000package ascii import "github.com/segmentio/asm/internal/unsafebytes" // ValidPrint returns true if b contains only printable ASCII characters. func ValidPrint(b []byte) bool { return ValidPrintString(unsafebytes.String(b)) } // ValidPrintBytes returns true if b is an ASCII character. func ValidPrintByte(b byte) bool { return 0x20 <= b && b <= 0x7e } // ValidPrintBytes returns true if b is an ASCII character. 
func ValidPrintRune(r rune) bool { return 0x20 <= r && r <= 0x7e } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid_print_amd64.go000066400000000000000000000004611452252572700265560ustar00rootroot00000000000000// Code generated by command: go run valid_print_asm.go -pkg ascii -out ../ascii/valid_print_amd64.s -stubs ../ascii/valid_print_amd64.go. DO NOT EDIT. //go:build !purego package ascii // ValidPrintString returns true if s contains only printable ASCII characters. func ValidPrintString(s string) bool golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid_print_amd64.s000066400000000000000000000063771452252572700264270ustar00rootroot00000000000000// Code generated by command: go run valid_print_asm.go -pkg ascii -out ../ascii/valid_print_amd64.s -stubs ../ascii/valid_print_amd64.go. DO NOT EDIT. //go:build !purego #include "textflag.h" // func ValidPrintString(s string) bool // Requires: AVX, AVX2, SSE4.1 TEXT ·ValidPrintString(SB), NOSPLIT, $0-17 MOVQ s_base+0(FP), AX MOVQ s_len+8(FP), CX CMPQ CX, $0x10 JB init_x86 BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS init_avx init_x86: CMPQ CX, $0x08 JB cmp4 MOVQ $0xdfdfdfdfdfdfdfe0, DX MOVQ $0x0101010101010101, BX MOVQ $0x8080808080808080, SI cmp8: MOVQ (AX), DI MOVQ DI, R8 LEAQ (DI)(DX*1), R9 NOTQ R8 ANDQ R8, R9 LEAQ (DI)(BX*1), R8 ORQ R8, DI ORQ R9, DI ADDQ $0x08, AX SUBQ $0x08, CX TESTQ SI, DI JNE done CMPQ CX, $0x08 JB cmp4 JMP cmp8 cmp4: CMPQ CX, $0x04 JB cmp3 MOVL (AX), DX MOVL DX, BX LEAL 3755991008(DX), SI NOTL BX ANDL BX, SI LEAL 16843009(DX), BX ORL BX, DX ORL SI, DX ADDQ $0x04, AX SUBQ $0x04, CX TESTL $0x80808080, DX JNE done cmp3: CMPQ CX, $0x03 JB cmp2 MOVWLZX (AX), DX MOVBLZX 2(AX), AX SHLL $0x10, AX ORL DX, AX ORL $0x20000000, AX JMP final cmp2: CMPQ CX, $0x02 JB cmp1 MOVWLZX (AX), AX ORL $0x20200000, AX JMP final cmp1: CMPQ CX, $0x00 JE done MOVBLZX (AX), AX ORL $0x20202000, AX final: MOVL AX, CX LEAL 3755991008(AX), DX NOTL CX ANDL CX, DX LEAL 16843009(AX), CX ORL CX, AX ORL DX, AX TESTL $0x80808080, AX done: SETEQ ret+16(FP) RET init_avx: MOVB $0x1f, DL PINSRB $0x00, DX, X8 VPBROADCASTB X8, Y8 MOVB $0x7e, DL PINSRB $0x00, DX, X9 VPBROADCASTB X9, Y9 cmp128: CMPQ CX, $0x80 JB cmp64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y1 VMOVDQU 64(AX), Y2 VMOVDQU 96(AX), Y3 VPCMPGTB Y8, Y0, Y4 VPCMPGTB Y9, Y0, Y0 VPANDN Y4, Y0, Y0 VPCMPGTB Y8, Y1, Y5 VPCMPGTB Y9, Y1, Y1 VPANDN Y5, Y1, Y1 VPCMPGTB Y8, Y2, Y6 VPCMPGTB Y9, Y2, Y2 VPANDN Y6, Y2, Y2 VPCMPGTB Y8, Y3, Y7 VPCMPGTB Y9, Y3, Y3 VPANDN Y7, Y3, Y3 VPAND Y1, Y0, Y0 VPAND Y3, Y2, Y2 VPAND Y2, Y0, Y0 ADDQ $0x80, AX SUBQ $0x80, CX VPMOVMSKB Y0, DX XORL $0xffffffff, DX JNE done JMP cmp128 cmp64: CMPQ CX, $0x40 JB cmp32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y1 VPCMPGTB Y8, Y0, Y2 VPCMPGTB Y9, Y0, Y0 VPANDN Y2, Y0, Y0 VPCMPGTB Y8, Y1, Y3 VPCMPGTB Y9, Y1, Y1 VPANDN Y3, Y1, Y1 VPAND Y1, Y0, Y0 ADDQ $0x40, AX SUBQ $0x40, CX VPMOVMSKB Y0, DX XORL $0xffffffff, DX JNE done cmp32: CMPQ CX, $0x20 JB cmp16 VMOVDQU (AX), Y0 VPCMPGTB Y8, Y0, Y1 VPCMPGTB Y9, Y0, Y0 VPANDN Y1, Y0, Y0 ADDQ $0x20, AX SUBQ $0x20, CX VPMOVMSKB Y0, DX XORL $0xffffffff, DX JNE done cmp16: CMPQ CX, $0x10 JLE cmp_tail VMOVDQU (AX), X0 VPCMPGTB X8, X0, X1 VPCMPGTB X9, X0, X0 VPANDN X1, X0, X0 ADDQ $0x10, AX SUBQ $0x10, CX VPMOVMSKB X0, DX XORL $0x0000ffff, DX JNE done cmp_tail: SUBQ $0x10, CX ADDQ CX, AX VMOVDQU (AX), X0 VPCMPGTB X8, X0, X1 VPCMPGTB X9, X0, X0 VPANDN X1, X0, X0 VPMOVMSKB X0, DX XORL $0x0000ffff, DX JMP done 
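// Note (added commentary): the cmp_tail block above handles the final 1..16
// remaining bytes by rewinding AX so that the last 16 bytes of the input are
// (re)loaded with a single overlapping read; inputs shorter than 16 bytes
// never reach this vector path because the initial CMPQ CX, $0x10 guard
// routes them to the scalar code.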
golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/ascii/valid_print_default.go000066400000000000000000000020011452252572700272630ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package ascii import "unsafe" // ValidPrintString returns true if s contains only printable ASCII characters. func ValidPrintString(s string) bool { p := *(*unsafe.Pointer)(unsafe.Pointer(&s)) i := uintptr(0) n := uintptr(len(s)) for i+8 <= n { if hasLess64(*(*uint64)(unsafe.Pointer(uintptr(p) + i)), 0x20) || hasMore64(*(*uint64)(unsafe.Pointer(uintptr(p) + i)), 0x7e) { return false } i += 8 } if i+4 <= n { if hasLess32(*(*uint32)(unsafe.Pointer(uintptr(p) + i)), 0x20) || hasMore32(*(*uint32)(unsafe.Pointer(uintptr(p) + i)), 0x7e) { return false } i += 4 } if i == n { return true } p = unsafe.Pointer(uintptr(p) + i) var x uint32 switch n - i { case 3: x = 0x20000000 | uint32(*(*uint16)(p)) | uint32(*(*uint8)(unsafe.Pointer(uintptr(p) + 2)))<<16 case 2: x = 0x20200000 | uint32(*(*uint16)(p)) case 1: x = 0x20202000 | uint32(*(*uint8)(p)) default: return true } return !(hasLess32(x, 0x20) || hasMore32(x, 0x7e)) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/000077500000000000000000000000001452252572700227145ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/base64.go000066400000000000000000000044321452252572700243320ustar00rootroot00000000000000package base64 import ( "encoding/base64" ) const ( StdPadding rune = base64.StdPadding NoPadding rune = base64.NoPadding encodeStd = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" encodeURL = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" encodeIMAP = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+," letterRange = int8('Z' - 'A' + 1) ) // StdEncoding is the standard base64 encoding, as defined in RFC 4648. var StdEncoding = NewEncoding(encodeStd) // URLEncoding is the alternate base64 encoding defined in RFC 4648. // It is typically used in URLs and file names. var URLEncoding = NewEncoding(encodeURL) // RawStdEncoding is the standard unpadded base64 encoding defined in RFC 4648 section 3.2. // This is the same as StdEncoding but omits padding characters. var RawStdEncoding = StdEncoding.WithPadding(NoPadding) // RawURLEncoding is the unpadded alternate base64 encoding defined in RFC 4648. // This is the same as URLEncoding but omits padding characters. var RawURLEncoding = URLEncoding.WithPadding(NoPadding) // NewEncoding returns a new padded Encoding defined by the given alphabet, // which must be a 64-byte string that does not contain the padding character // or CR / LF ('\r', '\n'). Unlike the standard library, the encoding alphabet // cannot be arbitrary, and it must follow one of the known standard encoding // variants. // // Required alphabet values: // * [0,26): characters 'A'..'Z' // * [26,52): characters 'a'..'z' // * [52,62): characters '0'..'9' // Flexible alphabet value options: // * RFC 4648, RFC 1421, RFC 2045, RFC 2152, RFC 4880: '+' and '/' // * RFC 4648 URI: '-' and '_' // * RFC 3501: '+' and ',' // // The resulting Encoding uses the default padding character ('='), which may // be changed or disabled via WithPadding. The padding character is unrestricted, // but it must be a character outside of the encoder alphabet. 
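//
// For example, constructing the RFC 3501 (IMAP) variant without padding
// (illustrative):
//
//	enc := base64.NewEncoding("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,")
//	enc = enc.WithPadding(base64.NoPadding)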
func NewEncoding(encoder string) *Encoding { if len(encoder) != 64 { panic("encoding alphabet is not 64-bytes long") } if _, ok := allowedEncoding[encoder]; !ok { panic("non-standard encoding alphabets are not supported") } return newEncoding(encoder) } var allowedEncoding = map[string]struct{}{ encodeStd: {}, encodeURL: {}, encodeIMAP: {}, } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/base64_amd64.go000066400000000000000000000040251452252572700253230ustar00rootroot00000000000000//go:build amd64 && !purego // +build amd64,!purego package base64 import ( "encoding/base64" "github.com/segmentio/asm/cpu" "github.com/segmentio/asm/cpu/x86" ) const ( encLutSize = 32 decLutSize = 48 minEncodeLen = 28 minDecodeLen = 45 ) func newEncoding(encoder string) *Encoding { e := &Encoding{base: base64.NewEncoding(encoder)} if cpu.X86.Has(x86.AVX2) { e.enableEncodeAVX2(encoder) e.enableDecodeAVX2(encoder) } return e } func (e *Encoding) enableEncodeAVX2(encoder string) { // Translate values 0..63 to the Base64 alphabet. There are five sets: // // From To Add Index Example // [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ // [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz // [52..61] [48..57] -4 [2..11] 0123456789 // [62] [43] -19 12 + // [63] [47] -16 13 / tab := [encLutSize]int8{int8(encoder[0]), int8(encoder[letterRange]) - letterRange} for i, ch := range encoder[2*letterRange:] { tab[2+i] = int8(ch) - 2*letterRange - int8(i) } e.enc = encodeAVX2 e.enclut = tab } func (e *Encoding) enableDecodeAVX2(encoder string) { c62, c63 := int8(encoder[62]), int8(encoder[63]) url := c63 == '_' if url { c63 = '/' } // Translate values from the Base64 alphabet using five sets. Values outside // of these ranges are considered invalid: // // From To Add Index Example // [47] [63] +16 1 / // [43] [62] +19 2 + // [48..57] [52..61] +4 3 0123456789 // [65..90] [0..25] -65 4,5 ABCDEFGHIJKLMNOPQRSTUVWXYZ // [97..122] [26..51] -71 6,7 abcdefghijklmnopqrstuvwxyz tab := [decLutSize]int8{ 0, 63 - c63, 62 - c62, 4, -65, -65, -71, -71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x13, 0x1B, 0x1B, 0x1B, 0x1B, 0x1B, } tab[(c62&15)+16] = 0x1A tab[(c63&15)+16] = 0x1A if url { e.dec = decodeAVX2URI } else { e.dec = decodeAVX2 } e.declut = tab } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/base64_arm64.go000066400000000000000000000015751452252572700253500ustar00rootroot00000000000000//go:build arm64 && !purego // +build arm64,!purego package base64 import ( "encoding/base64" ) const ( encLutSize = 16 decLutSize = 2 minEncodeLen = 16 * 3 minDecodeLen = 8 * 4 ) func newEncoding(encoder string) *Encoding { e := &Encoding{base: base64.NewEncoding(encoder)} e.enableEncodeARM64(encoder) e.enableDecodeARM64(encoder) return e } func (e *Encoding) enableEncodeARM64(encoder string) { c62, c63 := int8(encoder[62]), int8(encoder[63]) tab := [encLutSize]int8{ 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, c62 - 62, c63 - 63, 'A', 0, 0, } e.enc = encodeARM64 e.enclut = tab } func (e *Encoding) enableDecodeARM64(encoder string) { if encoder == encodeStd { e.dec = decodeStdARM64 } else { e.dec = decodeARM64 } e.declut = [decLutSize]int8{int8(encoder[62]), int8(encoder[63])} } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/base64_asm.go000066400000000000000000000052071452252572700251730ustar00rootroot00000000000000//go:build (amd64 || arm64) && !purego // +build amd64 arm64 // 
+build !purego package base64 import ( "encoding/base64" "github.com/segmentio/asm/internal/unsafebytes" ) // An Encoding is a radix 64 encoding/decoding scheme, defined by a // 64-character alphabet. type Encoding struct { enc func(dst []byte, src []byte, lut *int8) (int, int) enclut [encLutSize]int8 dec func(dst []byte, src []byte, lut *int8) (int, int) declut [decLutSize]int8 base *base64.Encoding } // WithPadding creates a duplicate Encoding updated with a specified padding // character, or NoPadding to disable padding. The padding character must not // be contained in the encoding alphabet, must not be '\r' or '\n', and must // be no greater than '\xFF'. func (enc Encoding) WithPadding(padding rune) *Encoding { enc.base = enc.base.WithPadding(padding) return &enc } // Strict creates a duplicate encoding updated with strict decoding enabled. // This requires that trailing padding bits are zero. func (enc Encoding) Strict() *Encoding { enc.base = enc.base.Strict() return &enc } // Encode encodes src using the defined encoding alphabet. // This will write EncodedLen(len(src)) bytes to dst. func (enc *Encoding) Encode(dst, src []byte) { if len(src) >= minEncodeLen && enc.enc != nil { d, s := enc.enc(dst, src, &enc.enclut[0]) dst = dst[d:] src = src[s:] } enc.base.Encode(dst, src) } // EncodeToString returns the base64 encoding of src as a string, using the // defined encoding alphabet. func (enc *Encoding) EncodeToString(src []byte) string { buf := make([]byte, enc.base.EncodedLen(len(src))) enc.Encode(buf, src) return string(buf) } // EncodedLen calculates the base64-encoded byte length for a message // of length n. func (enc *Encoding) EncodedLen(n int) int { return enc.base.EncodedLen(n) } // Decode decodes src using the defined encoding alphabet. // This will write at most DecodedLen(len(src)) bytes to dst and return the // number of bytes written. func (enc *Encoding) Decode(dst, src []byte) (n int, err error) { var d, s int if len(src) >= minDecodeLen && enc.dec != nil { d, s = enc.dec(dst, src, &enc.declut[0]) dst = dst[d:] src = src[s:] } n, err = enc.base.Decode(dst, src) n += d return } // DecodeString decodes the base64-encoded string s and returns the decoded // bytes. func (enc *Encoding) DecodeString(s string) ([]byte, error) { src := unsafebytes.BytesOf(s) dst := make([]byte, enc.base.DecodedLen(len(s))) n, err := enc.Decode(dst, src) return dst[:n], err } // DecodedLen calculates the decoded byte length for a base64-encoded message // of length n. 
func (enc *Encoding) DecodedLen(n int) int { return enc.base.DecodedLen(n) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/base64_asm_test.go000066400000000000000000000037651452252572700262410ustar00rootroot00000000000000//go:build (amd64 || arm64) && !purego // +build amd64 arm64 // +build !purego package base64 import ( "fmt" "testing" "github.com/segmentio/asm/internal/buffer" ) func fillBuffers(b *buffer.Buffer, size int) map[string][]byte { bufs := map[string][]byte{ "head": b.ProtectHead(), "tail": b.ProtectTail(), } for _, buf := range bufs { for i := 0; i < size; i++ { buf[i] = (255 - byte(i&15)*16) - byte(i&255)/16 } } return bufs } func TestEncodeASM(t *testing.T) { b, err := buffer.New(512) if err != nil { t.Fatal(err) } defer b.Release() bufs := fillBuffers(&b, 512) for _, enc := range encodings { if enc.candidate.enc == nil { t.Log("asm not enabled") continue } for name, buf := range bufs { ok := t.Run(fmt.Sprintf("%s-%s", enc.name, name), func(t *testing.T) { dst, err := buffer.New(enc.candidate.EncodedLen(len(buf))) if err != nil { t.Fatal(err) } defer dst.Release() _, ns := enc.candidate.enc(dst.ProtectTail(), buf, &enc.candidate.enclut[0]) if len(buf)-ns >= minEncodeLen { t.Errorf("encode remain should be less than %d, but is %d", minEncodeLen, len(buf)-ns) } }) if !ok { break } } } } func TestDecodeASM(t *testing.T) { b, err := buffer.New(512) if err != nil { t.Fatal(err) } defer b.Release() bufs := fillBuffers(&b, 512) for _, enc := range encodings { if enc.candidate.dec == nil { t.Log("asm not enabled") continue } for name, buf := range bufs { ok := t.Run(fmt.Sprintf("%s-%s", enc.name, name), func(t *testing.T) { src := make([]byte, enc.candidate.EncodedLen(len(buf))) dst, err := buffer.New(len(buf)) if err != nil { t.Fatal(err) } defer dst.Release() enc.candidate.Encode(src, buf) _, ns := enc.candidate.dec(dst.ProtectTail(), src, &enc.candidate.declut[0]) if len(buf)-ns >= minDecodeLen { t.Errorf("decode remain should be less than %d, but is %d", minDecodeLen, len(buf)-ns) } }) if !ok { break } } } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/base64_default.go000066400000000000000000000005031452252572700260310ustar00rootroot00000000000000//go:build purego || !(amd64 || arm64) // +build purego !amd64,!arm64 package base64 import "encoding/base64" // An Encoding is a radix 64 encoding/decoding scheme, defined by a // 64-character alphabet. 
type Encoding = base64.Encoding func newEncoding(encoder string) *Encoding { return base64.NewEncoding(encoder) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/base64_test.go000066400000000000000000000102701452252572700253660ustar00rootroot00000000000000package base64 import ( "bytes" "encoding/base64" "math/rand" "testing" ) var encodings = []struct { name string control *base64.Encoding candidate *Encoding }{ { name: "std", control: base64.StdEncoding, candidate: StdEncoding, }, { name: "url", control: base64.URLEncoding, candidate: URLEncoding, }, { name: "raw-std", control: base64.RawStdEncoding, candidate: RawStdEncoding, }, { name: "raw-url", control: base64.RawURLEncoding, candidate: RawURLEncoding, }, { name: "imap", control: base64.NewEncoding(encodeIMAP).WithPadding(NoPadding), candidate: NewEncoding(encodeIMAP).WithPadding(NoPadding), }, } func TestEncoding(t *testing.T) { for _, enc := range encodings { t.Run(enc.name, func(t *testing.T) { for i := 1; i < 1024; i++ { src := make([]byte, i) rand.Read(src) n := enc.control.EncodedLen(i) encExpect := make([]byte, n) encActual := make([]byte, n) enc.control.Encode(encExpect, src) enc.candidate.Encode(encActual, src) if !bytes.Equal(encExpect, encActual) { t.Errorf("failed encode:\n\texpect = %v\n\tactual = %v", encExpect, encActual) } n = enc.control.DecodedLen(n) decExpect := make([]byte, n) decActual := make([]byte, n) nControl, errControl := enc.control.Decode(decExpect, encExpect) nCandidate, errCandidate := enc.candidate.Decode(decActual, encActual) if errControl != nil { t.Fatalf("control decode error: %v", errControl) } if errCandidate != nil { t.Fatalf("candidate decode error: %v", errCandidate) } if nControl != nCandidate { t.Fatalf("failed decode length: expect = %d, actual = %d", nControl, nCandidate) } if !bytes.Equal(decExpect, decActual) { t.Fatalf("failed decode:\n\texpect = %v\n\tactual = %v", decExpect, decActual) } encString := enc.control.EncodeToString(src) decExpect, errControl = enc.control.DecodeString(encString) decActual, errCandidate = enc.candidate.DecodeString(encString) if errControl != nil { t.Fatalf("control decode error: %v", errControl) } if errCandidate != nil { t.Fatalf("candidate decode error: %v", errCandidate) } if !bytes.Equal(decExpect, decActual) { t.Fatalf("failed decode:\n\texpect = %v\n\tactual = %v", decExpect, decActual) } } }) } } func TestDecodeLines(t *testing.T) { src := []byte(`dGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRl c3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0 IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0 ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVzdCB0ZXN0IHRlc3QgdGVz dCB0ZXN0IHRlc3QgdGVzdA==`) expect := []byte(`test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test test`) actual := make([]byte, StdEncoding.DecodedLen(len(src))) n, err := StdEncoding.Decode(actual, src) if err != nil { t.Fatalf("decode error: %v", err) } if !bytes.Equal(expect, actual[:n]) { t.Errorf("failed decode:\n\texpect = %v\n\tactual = %v", expect, actual) } } func BenchmarkEncode(b *testing.B) { src := make([]byte, 4096) dst := make([]byte, base64.StdEncoding.EncodedLen(len(src))) rand.Read(src) b.Run("asm", func(b *testing.B) { for i := 0; i < b.N; i++ { StdEncoding.Encode(dst, src) } 
b.SetBytes(int64(len(src))) }) b.Run("go", func(b *testing.B) { for i := 0; i < b.N; i++ { base64.StdEncoding.Encode(dst, src) } b.SetBytes(int64(len(src))) }) } func BenchmarkDecode(b *testing.B) { raw := make([]byte, 4096) src := make([]byte, base64.StdEncoding.EncodedLen(len(raw))) dst := make([]byte, base64.StdEncoding.DecodedLen(len(src))) rand.Read(raw) base64.StdEncoding.Encode(src, raw) b.Run("asm", func(b *testing.B) { for i := 0; i < b.N; i++ { StdEncoding.Decode(dst, src) } b.SetBytes(int64(len(src))) }) b.Run("go", func(b *testing.B) { for i := 0; i < b.N; i++ { base64.StdEncoding.Decode(dst, src) } b.SetBytes(int64(len(src))) }) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/decode_amd64.go000066400000000000000000000004611452252572700254620ustar00rootroot00000000000000// Code generated by command: go run decode_asm.go -pkg base64 -out ../base64/decode_amd64.s -stubs ../base64/decode_amd64.go. DO NOT EDIT. //go:build !purego package base64 func decodeAVX2(dst []byte, src []byte, lut *int8) (int, int) func decodeAVX2URI(dst []byte, src []byte, lut *int8) (int, int) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/decode_amd64.s000066400000000000000000000101011452252572700253070ustar00rootroot00000000000000// Code generated by command: go run decode_asm.go -pkg base64 -out ../base64/decode_amd64.s -stubs ../base64/decode_amd64.go. DO NOT EDIT. //go:build !purego #include "textflag.h" DATA b64_dec_lut_hi<>+0(SB)/8, $0x0804080402011010 DATA b64_dec_lut_hi<>+8(SB)/8, $0x1010101010101010 DATA b64_dec_lut_hi<>+16(SB)/8, $0x0804080402011010 DATA b64_dec_lut_hi<>+24(SB)/8, $0x1010101010101010 GLOBL b64_dec_lut_hi<>(SB), RODATA|NOPTR, $32 DATA b64_dec_madd1<>+0(SB)/8, $0x0140014001400140 DATA b64_dec_madd1<>+8(SB)/8, $0x0140014001400140 DATA b64_dec_madd1<>+16(SB)/8, $0x0140014001400140 DATA b64_dec_madd1<>+24(SB)/8, $0x0140014001400140 GLOBL b64_dec_madd1<>(SB), RODATA|NOPTR, $32 DATA b64_dec_madd2<>+0(SB)/8, $0x0001100000011000 DATA b64_dec_madd2<>+8(SB)/8, $0x0001100000011000 DATA b64_dec_madd2<>+16(SB)/8, $0x0001100000011000 DATA b64_dec_madd2<>+24(SB)/8, $0x0001100000011000 GLOBL b64_dec_madd2<>(SB), RODATA|NOPTR, $32 DATA b64_dec_shuf_lo<>+0(SB)/8, $0x0000000000000000 DATA b64_dec_shuf_lo<>+8(SB)/8, $0x0600010200000000 GLOBL b64_dec_shuf_lo<>(SB), RODATA|NOPTR, $16 DATA b64_dec_shuf<>+0(SB)/8, $0x090a040506000102 DATA b64_dec_shuf<>+8(SB)/8, $0x000000000c0d0e08 DATA b64_dec_shuf<>+16(SB)/8, $0x0c0d0e08090a0405 DATA b64_dec_shuf<>+24(SB)/8, $0x0000000000000000 GLOBL b64_dec_shuf<>(SB), RODATA|NOPTR, $32 // func decodeAVX2(dst []byte, src []byte, lut *int8) (int, int) // Requires: AVX, AVX2, SSE4.1 TEXT ·decodeAVX2(SB), NOSPLIT, $0-72 MOVQ dst_base+0(FP), AX MOVQ src_base+24(FP), DX MOVQ lut+48(FP), SI MOVQ src_len+32(FP), DI MOVB $0x2f, CL PINSRB $0x00, CX, X8 VPBROADCASTB X8, Y8 XORQ CX, CX XORQ BX, BX VPXOR Y7, Y7, Y7 VPERMQ $0x44, (SI), Y6 VPERMQ $0x44, 16(SI), Y4 VMOVDQA b64_dec_lut_hi<>+0(SB), Y5 loop: VMOVDQU (DX)(BX*1), Y0 VPSRLD $0x04, Y0, Y2 VPAND Y8, Y0, Y3 VPSHUFB Y3, Y4, Y3 VPAND Y8, Y2, Y2 VPSHUFB Y2, Y5, Y9 VPTEST Y9, Y3 JNE done VPCMPEQB Y8, Y0, Y3 VPADDB Y3, Y2, Y2 VPSHUFB Y2, Y6, Y2 VPADDB Y0, Y2, Y0 VPMADDUBSW b64_dec_madd1<>+0(SB), Y0, Y0 VPMADDWD b64_dec_madd2<>+0(SB), Y0, Y0 VEXTRACTI128 $0x01, Y0, X1 VPSHUFB b64_dec_shuf_lo<>+0(SB), X1, X1 VPSHUFB b64_dec_shuf<>+0(SB), Y0, Y0 VPBLENDD $0x08, Y1, Y0, Y1 VPBLENDD $0xc0, Y7, Y1, Y1 VMOVDQU Y1, (AX)(CX*1) ADDQ $0x18, CX ADDQ $0x20, BX SUBQ $0x20, DI CMPQ DI, $0x2d JB done JMP loop done: 
MOVQ CX, ret+56(FP) MOVQ BX, ret1+64(FP) VZEROUPPER RET // func decodeAVX2URI(dst []byte, src []byte, lut *int8) (int, int) // Requires: AVX, AVX2, SSE4.1 TEXT ·decodeAVX2URI(SB), NOSPLIT, $0-72 MOVB $0x2f, AL PINSRB $0x00, AX, X0 VPBROADCASTB X0, Y0 MOVB $0x5f, AL PINSRB $0x00, AX, X1 VPBROADCASTB X1, Y1 MOVQ dst_base+0(FP), AX MOVQ src_base+24(FP), DX MOVQ lut+48(FP), SI MOVQ src_len+32(FP), DI MOVB $0x2f, CL PINSRB $0x00, CX, X10 VPBROADCASTB X10, Y10 XORQ CX, CX XORQ BX, BX VPXOR Y9, Y9, Y9 VPERMQ $0x44, (SI), Y8 VPERMQ $0x44, 16(SI), Y6 VMOVDQA b64_dec_lut_hi<>+0(SB), Y7 loop: VMOVDQU (DX)(BX*1), Y2 VPCMPEQB Y2, Y1, Y4 VPBLENDVB Y4, Y0, Y2, Y2 VPSRLD $0x04, Y2, Y4 VPAND Y10, Y2, Y5 VPSHUFB Y5, Y6, Y5 VPAND Y10, Y4, Y4 VPSHUFB Y4, Y7, Y11 VPTEST Y11, Y5 JNE done VPCMPEQB Y10, Y2, Y5 VPADDB Y5, Y4, Y4 VPSHUFB Y4, Y8, Y4 VPADDB Y2, Y4, Y2 VPMADDUBSW b64_dec_madd1<>+0(SB), Y2, Y2 VPMADDWD b64_dec_madd2<>+0(SB), Y2, Y2 VEXTRACTI128 $0x01, Y2, X3 VPSHUFB b64_dec_shuf_lo<>+0(SB), X3, X3 VPSHUFB b64_dec_shuf<>+0(SB), Y2, Y2 VPBLENDD $0x08, Y3, Y2, Y3 VPBLENDD $0xc0, Y9, Y3, Y3 VMOVDQU Y3, (AX)(CX*1) ADDQ $0x18, CX ADDQ $0x20, BX SUBQ $0x20, DI CMPQ DI, $0x2d JB done JMP loop done: MOVQ CX, ret+56(FP) MOVQ BX, ret1+64(FP) VZEROUPPER RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/decode_arm64.go000066400000000000000000000002671452252572700255040ustar00rootroot00000000000000//go:build !purego // +build !purego package base64 func decodeARM64(dst []byte, src []byte, lut *int8) (int, int) func decodeStdARM64(dst []byte, src []byte, lut *int8) (int, int) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/decode_arm64.s000066400000000000000000000222161452252572700253370ustar00rootroot00000000000000#include "textflag.h" #define LOAD_ARGS() \ MOVD dst_base+0(FP), R0; \ MOVD R0, R3; \ MOVD src_base+24(FP), R1; \ MOVD R1, R4; \ MOVD src_len+32(FP), R2; \ BIC $31, R2, R2; \ ADD R1, R2, R2 #define LOAD_ARG_LUT() \ MOVD lut+48(FP), R5; \ VLD2R (R5), [V0.B16, V1.B16] #define LOAD_CONST_LUT() \ MOVD $·mask_lut(SB), R6; \ MOVD $·bpos_lut(SB), R7; \ MOVD $·shft_lut(SB), R8; \ VLD1 (R6), [V2.B16]; \ VLD1 (R7), [V3.B16]; \ VLD1 (R8), [V4.B16]; \ VMOVI $43, V5.B8; \ VMOVI $47, V6.B8; \ VMOVI $15, V7.B8; \ VMOVI $16, V8.B8; \ #define LOAD_INPUT() \ VLD4 (R4), [V10.B8, V11.B8, V12.B8, V13.B8] #define COMPARE_INPUT(v) \ VCMEQ V10.B8, v.B8, V14.B8; \ VCMEQ V11.B8, v.B8, V15.B8; \ VCMEQ V12.B8, v.B8, V16.B8; \ VCMEQ V13.B8, v.B8, V17.B8 #define UPDATE_INPUT(v) \ VBIT V14.B8, v.B8, V10.B8; \ VBIT V15.B8, v.B8, V11.B8; \ VBIT V16.B8, v.B8, V12.B8; \ VBIT V17.B8, v.B8, V13.B8 #define DECODE_INPUT(goto_err) \ /* Create hi/lo nibles */ \ VUSHR $4, V10.B8, V18.B8; \ VUSHR $4, V11.B8, V19.B8; \ VUSHR $4, V12.B8, V20.B8; \ VUSHR $4, V13.B8, V21.B8; \ VAND V7.B8, V10.B8, V22.B8; \ VAND V7.B8, V11.B8, V23.B8; \ VAND V7.B8, V12.B8, V24.B8; \ VAND V7.B8, V13.B8, V25.B8; \ /* Detect invalid input characters */ \ VTBL V22.B8, [V2.B8], V22.B8; \ VTBL V23.B8, [V2.B8], V23.B8; \ VTBL V24.B8, [V2.B8], V24.B8; \ VTBL V25.B8, [V2.B8], V25.B8; \ VTBL V18.B8, [V3.B8], V26.B8; \ VTBL V19.B8, [V3.B8], V27.B8; \ VTBL V20.B8, [V3.B8], V28.B8; \ VTBL V21.B8, [V3.B8], V29.B8; \ VAND V22.B8, V26.B8, V26.B8; \ VAND V23.B8, V27.B8, V27.B8; \ VAND V24.B8, V28.B8, V28.B8; \ VAND V25.B8, V29.B8, V29.B8; \ WORD $0x0e209b5a /* VCMEQ $0, V26.B8, V26.B8 */; \ WORD $0x0e209b7b /* VCMEQ $0, V27.B8, V27.B8 */; \ WORD $0x0e209b9c /* VCMEQ $0, V28.B8, V28.B8 */; \ WORD $0x0e209bbd /* VCMEQ $0, V29.B8, V29.B8 */; \ VORR V26.B8, 
V27.B8, V26.B8; \ VORR V28.B8, V29.B8, V28.B8; \ VORR V26.B8, V28.B8, V26.B8; \ VMOV V26.D[0], R5; \ VMOV V26.D[1], R6; \ ORR R6, R5; \ CBNZ R5, goto_err; \ /* Shift hi nibles */ \ VTBL V18.B8, [V4.B8], V18.B8; \ VTBL V19.B8, [V4.B8], V19.B8; \ VTBL V20.B8, [V4.B8], V20.B8; \ VTBL V21.B8, [V4.B8], V21.B8; \ VBIT V14.B8, V8.B8, V18.B8; \ VBIT V15.B8, V8.B8, V19.B8; \ VBIT V16.B8, V8.B8, V20.B8; \ VBIT V17.B8, V8.B8, V21.B8; \ /* Combine results */ \ VADD V18.B8, V10.B8, V10.B8; \ VADD V19.B8, V11.B8, V11.B8; \ VADD V20.B8, V12.B8, V12.B8; \ VADD V21.B8, V13.B8, V13.B8; \ VUSHR $4, V11.B8, V14.B8; \ VUSHR $2, V12.B8, V15.B8; \ VSHL $2, V10.B8, V10.B8; \ VSHL $4, V11.B8, V11.B8; \ VSHL $6, V12.B8, V12.B8; \ VORR V10.B8, V14.B8, V16.B8; \ VORR V11.B8, V15.B8, V17.B8; \ VORR V12.B8, V13.B8, V18.B8 #define ADVANCE_LOOP(goto_loop) \ VST3.P [V16.B8, V17.B8, V18.B8], 24(R3); \ ADD $32, R4; \ CMP R4, R2; \ BGT goto_loop #define RETURN() \ SUB R0, R3; \ SUB R1, R4; \ MOVD R3, ret+56(FP); \ MOVD R4, ret1+64(FP); \ RET // func decodeARM64(dst []byte, src []byte, lut *int8) (int, int) TEXT ·decodeARM64(SB),NOSPLIT,$0-72 LOAD_ARGS() LOAD_ARG_LUT() LOAD_CONST_LUT() loop: LOAD_INPUT() // Compare and normalize the 63rd and 64th characters COMPARE_INPUT(V0) UPDATE_INPUT(V5) COMPARE_INPUT(V1) UPDATE_INPUT(V6) DECODE_INPUT(done) // Detect invalid input characters ADVANCE_LOOP(loop) // Store results and continue done: // RETURN() replacing the macro to please go vet. SUB R0, R3; SUB R1, R4; MOVD R3, ret+56(FP); MOVD R4, ret1+64(FP); RET // func decodeStdARM64(dst []byte, src []byte, lut *int8) (int, int) TEXT ·decodeStdARM64(SB),NOSPLIT,$0-72 LOAD_ARGS() LOAD_CONST_LUT() loop: LOAD_INPUT() COMPARE_INPUT(V6) // Compare to '+' DECODE_INPUT(done) // Detect invalid input characters ADVANCE_LOOP(loop) // Store results and continue done: // RETURN() replacing the macro to please go vet. 
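	// The return values are byte counts: R3-R0 = bytes written to dst, R4-R1 = bytes consumed from src.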
SUB R0, R3; SUB R1, R4; MOVD R3, ret+56(FP); MOVD R4, ret1+64(FP); RET DATA ·mask_lut+0x00(SB)/1, $0xa8 DATA ·mask_lut+0x01(SB)/1, $0xf8 DATA ·mask_lut+0x02(SB)/1, $0xf8 DATA ·mask_lut+0x03(SB)/1, $0xf8 DATA ·mask_lut+0x04(SB)/1, $0xf8 DATA ·mask_lut+0x05(SB)/1, $0xf8 DATA ·mask_lut+0x06(SB)/1, $0xf8 DATA ·mask_lut+0x07(SB)/1, $0xf8 DATA ·mask_lut+0x08(SB)/1, $0xf8 DATA ·mask_lut+0x09(SB)/1, $0xf8 DATA ·mask_lut+0x0a(SB)/1, $0xf0 DATA ·mask_lut+0x0b(SB)/1, $0x54 DATA ·mask_lut+0x0c(SB)/1, $0x50 DATA ·mask_lut+0x0d(SB)/1, $0x50 DATA ·mask_lut+0x0e(SB)/1, $0x50 DATA ·mask_lut+0x0f(SB)/1, $0x54 GLOBL ·mask_lut(SB), NOPTR|RODATA, $16 DATA ·bpos_lut+0x00(SB)/1, $0x01 DATA ·bpos_lut+0x01(SB)/1, $0x02 DATA ·bpos_lut+0x02(SB)/1, $0x04 DATA ·bpos_lut+0x03(SB)/1, $0x08 DATA ·bpos_lut+0x04(SB)/1, $0x10 DATA ·bpos_lut+0x05(SB)/1, $0x20 DATA ·bpos_lut+0x06(SB)/1, $0x40 DATA ·bpos_lut+0x07(SB)/1, $0x80 DATA ·bpos_lut+0x08(SB)/1, $0x00 DATA ·bpos_lut+0x09(SB)/1, $0x00 DATA ·bpos_lut+0x0a(SB)/1, $0x00 DATA ·bpos_lut+0x0b(SB)/1, $0x00 DATA ·bpos_lut+0x0c(SB)/1, $0x00 DATA ·bpos_lut+0x0d(SB)/1, $0x00 DATA ·bpos_lut+0x0e(SB)/1, $0x00 DATA ·bpos_lut+0x0f(SB)/1, $0x00 GLOBL ·bpos_lut(SB), NOPTR|RODATA, $16 DATA ·shft_lut+0x00(SB)/1, $0x00 DATA ·shft_lut+0x01(SB)/1, $0x00 DATA ·shft_lut+0x02(SB)/1, $0x13 DATA ·shft_lut+0x03(SB)/1, $0x04 DATA ·shft_lut+0x04(SB)/1, $0xbf DATA ·shft_lut+0x05(SB)/1, $0xbf DATA ·shft_lut+0x06(SB)/1, $0xb9 DATA ·shft_lut+0x07(SB)/1, $0xb9 DATA ·shft_lut+0x08(SB)/1, $0x00 DATA ·shft_lut+0x09(SB)/1, $0x00 DATA ·shft_lut+0x0a(SB)/1, $0x00 DATA ·shft_lut+0x0b(SB)/1, $0x00 DATA ·shft_lut+0x0c(SB)/1, $0x00 DATA ·shft_lut+0x0d(SB)/1, $0x00 DATA ·shft_lut+0x0e(SB)/1, $0x00 DATA ·shft_lut+0x0f(SB)/1, $0x00 GLOBL ·shft_lut(SB), NOPTR|RODATA, $16 golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/encode_amd64.go000066400000000000000000000003571452252572700255000ustar00rootroot00000000000000// Code generated by command: go run encode_asm.go -pkg base64 -out ../base64/encode_amd64.s -stubs ../base64/encode_amd64.go. DO NOT EDIT. //go:build !purego package base64 func encodeAVX2(dst []byte, src []byte, lut *int8) (int, int) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/encode_amd64.s000066400000000000000000000051601452252572700253320ustar00rootroot00000000000000// Code generated by command: go run encode_asm.go -pkg base64 -out ../base64/encode_amd64.s -stubs ../base64/encode_amd64.go. DO NOT EDIT. 
//go:build !purego #include "textflag.h" // func encodeAVX2(dst []byte, src []byte, lut *int8) (int, int) // Requires: AVX, AVX2, SSE4.1 TEXT ·encodeAVX2(SB), NOSPLIT, $0-72 MOVQ dst_base+0(FP), AX MOVQ src_base+24(FP), DX MOVQ lut+48(FP), SI MOVQ src_len+32(FP), DI MOVB $0x33, CL PINSRB $0x00, CX, X4 VPBROADCASTB X4, Y4 MOVB $0x19, CL PINSRB $0x00, CX, X5 VPBROADCASTB X5, Y5 XORQ CX, CX XORQ BX, BX // Load the 16-byte LUT into both lanes of the register VPERMQ $0x44, (SI), Y3 // Load the first block using a mask to avoid potential fault VMOVDQU b64_enc_load<>+0(SB), Y0 VPMASKMOVD -4(DX)(BX*1), Y0, Y0 loop: VPSHUFB b64_enc_shuf<>+0(SB), Y0, Y0 VPAND b64_enc_mask1<>+0(SB), Y0, Y1 VPSLLW $0x08, Y1, Y2 VPSLLW $0x04, Y1, Y1 VPBLENDW $0xaa, Y2, Y1, Y2 VPAND b64_enc_mask2<>+0(SB), Y0, Y1 VPMULHUW b64_enc_mult<>+0(SB), Y1, Y0 VPOR Y0, Y2, Y0 VPSUBUSB Y4, Y0, Y1 VPCMPGTB Y5, Y0, Y2 VPSUBB Y2, Y1, Y1 VPSHUFB Y1, Y3, Y1 VPADDB Y0, Y1, Y0 VMOVDQU Y0, (AX)(CX*1) ADDQ $0x20, CX ADDQ $0x18, BX SUBQ $0x18, DI CMPQ DI, $0x20 JB done VMOVDQU -4(DX)(BX*1), Y0 JMP loop done: MOVQ CX, ret+56(FP) MOVQ BX, ret1+64(FP) VZEROUPPER RET DATA b64_enc_load<>+0(SB)/8, $0x8000000000000000 DATA b64_enc_load<>+8(SB)/8, $0x8000000080000000 DATA b64_enc_load<>+16(SB)/8, $0x8000000080000000 DATA b64_enc_load<>+24(SB)/8, $0x8000000080000000 GLOBL b64_enc_load<>(SB), RODATA|NOPTR, $32 DATA b64_enc_shuf<>+0(SB)/8, $0x0809070805060405 DATA b64_enc_shuf<>+8(SB)/8, $0x0e0f0d0e0b0c0a0b DATA b64_enc_shuf<>+16(SB)/8, $0x0405030401020001 DATA b64_enc_shuf<>+24(SB)/8, $0x0a0b090a07080607 GLOBL b64_enc_shuf<>(SB), RODATA|NOPTR, $32 DATA b64_enc_mask1<>+0(SB)/8, $0x003f03f0003f03f0 DATA b64_enc_mask1<>+8(SB)/8, $0x003f03f0003f03f0 DATA b64_enc_mask1<>+16(SB)/8, $0x003f03f0003f03f0 DATA b64_enc_mask1<>+24(SB)/8, $0x003f03f0003f03f0 GLOBL b64_enc_mask1<>(SB), RODATA|NOPTR, $32 DATA b64_enc_mask2<>+0(SB)/8, $0x0fc0fc000fc0fc00 DATA b64_enc_mask2<>+8(SB)/8, $0x0fc0fc000fc0fc00 DATA b64_enc_mask2<>+16(SB)/8, $0x0fc0fc000fc0fc00 DATA b64_enc_mask2<>+24(SB)/8, $0x0fc0fc000fc0fc00 GLOBL b64_enc_mask2<>(SB), RODATA|NOPTR, $32 DATA b64_enc_mult<>+0(SB)/8, $0x0400004004000040 DATA b64_enc_mult<>+8(SB)/8, $0x0400004004000040 DATA b64_enc_mult<>+16(SB)/8, $0x0400004004000040 DATA b64_enc_mult<>+24(SB)/8, $0x0400004004000040 GLOBL b64_enc_mult<>(SB), RODATA|NOPTR, $32 golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/encode_arm64.go000066400000000000000000000001651452252572700255130ustar00rootroot00000000000000//go:build !purego // +build !purego package base64 func encodeARM64(dst []byte, src []byte, lut *int8) (int, int) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/base64/encode_arm64.s000066400000000000000000000051111452252572700253440ustar00rootroot00000000000000#include "textflag.h" #define Rdst R0 #define Rsrc R1 #define Rlen R2 #define Rwr R3 #define Rrem R4 #define Rtmp R5 #define Vlut V0 #define Vfld0 V6 #define Vfld1 V7 #define Vfld2 V8 #define Vfld3 V9 #define Vsrc0 V10 #define Vsrc1 V11 #define Vsrc2 V12 #define Vr0a V13 #define Vr1a V14 #define Vr2a V15 #define Vr3a V16 #define Vr0b V17 #define Vr1b V18 #define Vr2b V19 #define Vr3b V20 // func encodeARM64(dst []byte, src []byte, lut *int8) (int, int) TEXT ·encodeARM64(SB),NOSPLIT,$0-72 // Load dst/src info MOVD dst_base+0(FP), Rdst MOVD src_base+24(FP), Rsrc MOVD src_len+32(FP), Rlen MOVD lut+48(FP), Rtmp VLD1 (Rtmp), [Vlut.B16] MOVD Rlen, Rrem MOVD Rdst, Rwr VMOVI $51, V1.B16 VMOVI $26, V2.B16 VMOVI $63, V3.B16 VMOVI $13, V4.B16 loop: VLD3.P 48(Rsrc), [Vsrc0.B16, 
Vsrc1.B16, Vsrc2.B16] // Split 3 source blocks into 4 lookup inputs VUSHR $2, Vsrc0.B16, Vfld0.B16 VUSHR $4, Vsrc1.B16, Vfld1.B16 VUSHR $6, Vsrc2.B16, Vfld2.B16 VSHL $4, Vsrc0.B16, Vsrc0.B16 VSHL $2, Vsrc1.B16, Vsrc1.B16 VORR Vsrc0.B16, Vfld1.B16, Vfld1.B16 VORR Vsrc1.B16, Vfld2.B16, Vfld2.B16 VAND V3.B16, Vfld1.B16, Vfld1.B16 VAND V3.B16, Vfld2.B16, Vfld2.B16 VAND V3.B16, Vsrc2.B16, Vfld3.B16 WORD $0x6e212ccd // VUQSUB V1.B16, Vfld0.B16, Vr0a.B16 WORD $0x4e263451 // VCMGT V2.B16, Vfld0.B16, Vr0b.B16 VAND V4.B16, Vr0b.B16, Vr0b.B16 VORR Vr0b.B16, Vr0a.B16, Vr0a.B16 WORD $0x6e212cee // VUQSUB V1.B16, Vfld1.B16, Vr1a.B16 WORD $0x4e273452 // VCMGT V2.B16, Vfld1.B16, Vr1b.B16 VAND V4.B16, Vr1b.B16, Vr1b.B16 VORR Vr1b.B16, Vr1a.B16, Vr1a.B16 WORD $0x6e212d0f // VUQSUB V1.B16, Vfld2.B16, Vr2a.B16 WORD $0x4e283453 // VCMGT V2.B16, Vfld2.B16, Vr2b.B16 VAND V4.B16, Vr2b.B16, Vr2b.B16 VORR Vr2b.B16, Vr2a.B16, Vr2a.B16 WORD $0x6e212d30 // VUQSUB V1.B16, Vfld3.B16, Vr3a.B16 WORD $0x4e293454 // VCMGT V2.B16, Vfld3.B16, Vr3b.B16 VAND V4.B16, Vr3b.B16, Vr3b.B16 VORR Vr3b.B16, Vr3a.B16, Vr3a.B16 // Add result of lookup table to each field VTBL Vr0a.B16, [Vlut.B16], Vr0a.B16 VADD Vr0a.B16, Vfld0.B16, Vfld0.B16 VTBL Vr1a.B16, [Vlut.B16], Vr1a.B16 VADD Vr1a.B16, Vfld1.B16, Vfld1.B16 VTBL Vr2a.B16, [Vlut.B16], Vr2a.B16 VADD Vr2a.B16, Vfld2.B16, Vfld2.B16 VTBL Vr3a.B16, [Vlut.B16], Vr3a.B16 VADD Vr3a.B16, Vfld3.B16, Vfld3.B16 VST4.P [Vfld0.B16, Vfld1.B16, Vfld2.B16, Vfld3.B16], 64(Rwr) SUB $48, Rrem CMP $48, Rrem BGE loop done: SUB Rdst, Rwr SUB Rrem, Rlen MOVD Rwr, ret+56(FP) MOVD Rlen, ret1+64(FP) RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/bswap/000077500000000000000000000000001452252572700227445ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/bswap/swap64.go000066400000000000000000000007551452252572700244260ustar00rootroot00000000000000package bswap import _ "github.com/segmentio/asm/cpu" // Swap64 performs an in-place byte swap on each 64 bits elements in b. // // This function is useful when dealing with big-endian input; by converting it // to little-endian, the data can then be compared using native CPU instructions // instead of having to employ often slower byte comparison algorithms. func Swap64(b []byte) { if len(b)%8 != 0 { panic("swap64 expects the input to contain full 64 bits elements") } swap64(b) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/bswap/swap64_amd64.go000066400000000000000000000004171452252572700254140ustar00rootroot00000000000000// Code generated by command: go run swap64_asm.go -pkg bswap -out ../bswap/swap64_amd64.s -stubs ../bswap/swap64_amd64.go. DO NOT EDIT. //go:build !purego package bswap // swap64 performs an in-place byte swap on each qword of the input buffer. func swap64(b []byte) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/bswap/swap64_amd64.s000066400000000000000000000027201452252572700252500ustar00rootroot00000000000000// Code generated by command: go run swap64_asm.go -pkg bswap -out ../bswap/swap64_amd64.s -stubs ../bswap/swap64_amd64.go. DO NOT EDIT. 
//go:build !purego #include "textflag.h" // func swap64(b []byte) // Requires: AVX, AVX2 TEXT ·swap64(SB), NOSPLIT, $0-24 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX MOVQ AX, DX ADDQ CX, DX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC x86_loop VMOVDQU shuffle_mask<>+0(SB), Y0 avx2_loop: MOVQ AX, CX ADDQ $0x80, CX CMPQ CX, DX JAE x86_loop VMOVDQU (AX), Y1 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y3 VMOVDQU 96(AX), Y4 VPSHUFB Y0, Y1, Y1 VPSHUFB Y0, Y2, Y2 VPSHUFB Y0, Y3, Y3 VPSHUFB Y0, Y4, Y4 VMOVDQU Y1, (AX) VMOVDQU Y2, 32(AX) VMOVDQU Y3, 64(AX) VMOVDQU Y4, 96(AX) MOVQ CX, AX JMP avx2_loop x86_loop: MOVQ AX, CX ADDQ $0x20, CX CMPQ CX, DX JAE slow_loop MOVQ (AX), BX MOVQ 8(AX), SI MOVQ 16(AX), DI MOVQ 24(AX), R8 BSWAPQ BX BSWAPQ SI BSWAPQ DI BSWAPQ R8 MOVQ BX, (AX) MOVQ SI, 8(AX) MOVQ DI, 16(AX) MOVQ R8, 24(AX) MOVQ CX, AX JMP x86_loop slow_loop: CMPQ AX, DX JAE done MOVQ (AX), CX BSWAPQ CX MOVQ CX, (AX) ADDQ $0x08, AX JMP slow_loop done: RET DATA shuffle_mask<>+0(SB)/8, $0x0001020304050607 DATA shuffle_mask<>+8(SB)/8, $0x08090a0b0c0d0e0f DATA shuffle_mask<>+16(SB)/8, $0x0001020304050607 DATA shuffle_mask<>+24(SB)/8, $0x08090a0b0c0d0e0f GLOBL shuffle_mask<>(SB), RODATA|NOPTR, $32 golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/bswap/swap64_default.go000066400000000000000000000003551452252572700261260ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package bswap import "encoding/binary" func swap64(b []byte) { for i := 0; i < len(b); i += 8 { u := binary.BigEndian.Uint64(b[i:]) binary.LittleEndian.PutUint64(b[i:], u) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/bswap/swap64_test.go000066400000000000000000000013721452252572700254610ustar00rootroot00000000000000package bswap import ( "encoding/binary" "io" "math/rand" "testing" ) func TestSwap64(t *testing.T) { input := make([]byte, 4096) prng := rand.New(rand.NewSource(0)) io.ReadFull(prng, input) output := make([]byte, 4096) for i := 0; i < 4096; i += 8 { copy(output, input) Swap64(output[:i]) for j := 0; j < i; j += 8 { u1 := binary.BigEndian.Uint64(input[j:]) u2 := binary.LittleEndian.Uint64(output[j:]) if u1 != u2 { t.Fatalf("bytes weren't swapped at offset %d: %v / %v", i, u1, u2) } } } } func BenchmarkSwap64(b *testing.B) { input := make([]byte, 64*1024) prng := rand.New(rand.NewSource(0)) io.ReadFull(prng, input) b.SetBytes(int64(len(input))) b.ResetTimer() for i := 0; i < b.N; i++ { Swap64(input) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/000077500000000000000000000000001452252572700227275ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/ascii/000077500000000000000000000000001452252572700240175ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/ascii/equal_fold_asm.go000066400000000000000000000127761452252572700273360ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/x86" "fmt" "github.com/segmentio/asm/cpu" ) func init() { ConstraintExpr("!purego") } func main() { TEXT("EqualFoldString", NOSPLIT, "func(a, b string) bool") Doc( "EqualFoldString is a version of strings.EqualFold designed to work on ASCII", "input instead of UTF-8.", "", "When the program has guarantees that the input is composed of ASCII", "characters only, it allows for greater optimizations.", ) // Use index for byte position. 
We have plenty of registers, and it saves an
	// ADD operation as the memory index is the same for both a and b.
	i := GP64()
	a := Mem{Base: Load(Param("a").Base(), GP64()), Index: i, Scale: 1}
	n := Load(Param("a").Len(), GP64())
	b := Mem{Base: Load(Param("b").Base(), GP64()), Index: i, Scale: 1}
	bn, _ := Param("b").Len().Resolve()
	ret, _ := ReturnIndex(0).Resolve()

	CMPQ(n, bn.Addr)      // if len(a) != len(b):
	JNE(LabelRef("done")) //   return false
	XORQ(i, i)            // i = 0

	CMPQ(n, U8(16))                     // if n < 16:
	JB(LabelRef("init_x86"))            //   goto init_x86
	JumpIfFeature("init_avx", cpu.AVX2) // goto init_avx if supported

	Label("init_x86")
	cmp := GP32()
	av := GP32()
	bv := GP32()

	// Map to convert ASCII upper-case characters to lower case.
	lowerCase := Mem{Base: GP64(), Scale: 1}
	LEAQ(NewDataAddr(Symbol{Name: "github·com∕segmentio∕asm∕ascii·lowerCase"}, 0), lowerCase.Base)
	XORL(cmp, cmp)

	Label("cmp8")
	CMPQ(n, U8(8))        // if n < 8:
	JB(LabelRef("cmp7"))  //   goto cmp7

	for i := 0; i < 8; i++ {
		MOVBLZX(a.Offset(i), av)             // av = a[i]
		MOVBLZX(b.Offset(i), bv)             // bv = b[i]
		MOVB(lowerCase.Idx(av, 1), av.As8()) // av = lowerCase[av]
		XORB(lowerCase.Idx(bv, 1), av.As8()) // av = lowerCase[bv] ^ av
		ORB(av.As8(), cmp.As8())             // cmp |= av
	}

	JNE(LabelRef("done")) // return false if ZF == 0
	ADDQ(Imm(8), a.Index) // i += 8
	SUBQ(Imm(8), n)       // n -= 8
	JMP(LabelRef("cmp8"))

	for i := 6; i >= 0; i-- {
		Label(fmt.Sprintf("cmp%d", i+1))
		next := "success"
		if i > 0 {
			next = fmt.Sprintf("cmp%d", i)
		}

		CMPQ(n, U8(i+1))   // if n < i+1:
		JB(LabelRef(next)) //   goto cmp${i}
		MOVBLZX(a.Offset(i), av)             // av = a[i]
		MOVBLZX(b.Offset(i), bv)             // bv = b[i]
		MOVB(lowerCase.Idx(av, 1), av.As8()) // av = lowerCase[av]
		XORB(lowerCase.Idx(bv, 1), av.As8()) // av = lowerCase[bv] ^ av
		ORB(av.As8(), cmp.As8())             // cmp |= av
	}

	Label("done")
	SETEQ(ret.Addr) // return ZF
	RET()           // ...

	Label("success")
	MOVB(U8(1), ret.Addr) // return true
	RET()                 // ...

	Label("init_avx")
	bit := VecBroadcast(U8(0x20), YMM()) // "case" bit
	msk := VecBroadcast(U8(0x1F), YMM()) // 0b10000000 - 'a'
	rng := VecBroadcast(U8(0x9A), YMM()) // 'z' - 'a' + 1 - 0x80 (overflowed 8-bits)
	one := VecBroadcast(U8(0x01), YMM()) // 1-bit for ANDing with comparison

	vec := NewVectorizer(12, func(l VectorLane) Register {
		v0 := l.Read(a)
		v1 := l.Read(b)
		v2 := l.Alloc()
		VXORPD(v0, v1, v1)    // calculate difference between a and b
		VPCMPEQB(bit, v1, v2) // check if above difference is the 6th bit
		VORPD(bit, v0, v0)    // set the 6th bit for a
		VPADDB(msk, v0, v0)   // add 0x1f to each byte to set top bit for letters
		VPCMPGTB(v0, rng, v0) // compare if not letter: v - 'a' < 'z' - 'a' + 1
		VPAND(v2, v0, v0)     // combine 6th-bit difference with letter range
		VPAND(one, v0, v0)    // merge test mask
		VPSLLW(Imm(5), v0, v0) // shift into case bit position
		VPCMPEQB(v1, v0, v0)  // compare original difference with case-only difference
		return v0
	}).Reduce(ReduceAnd) // merge all comparisons together

	cmpAVX := func(spec Spec, lanes int, incr bool) {
		sz := int(spec.Size())
		tmp := GP32()
		out := vec.Compile(spec, lanes)[0] // [compare sz*lanes bytes]
		if incr {
			ADDQ(U8(sz*lanes), a.Index) // i += sz*lanes
			SUBQ(U8(sz*lanes), n)       // n -= sz*lanes
		}
		VPMOVMSKB(out, tmp) // tmp[0,1,2,...] = y0[0,8,16,...]
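		// XORing against a mask of sz one-bits inverts the "all bytes
		// matched" result, so ZF is set only when every byte in the
		// block compared equal after case folding.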
XORL(U32(^uint32(0)>>(32-sz)), tmp) // ZF = (tmp == 0xFFFFFFFF) } Label("cmp128") CMPQ(n, U8(128)) // if n < 128: JB(LabelRef("cmp64")) // goto cmp64 cmpAVX(S256, 4, true) // ZF = [compare 128 bytes] JNE(LabelRef("done")) // return if ZF == 0 JMP(LabelRef("cmp128")) // loop cmp128 Label("cmp64") CMPQ(n, U8(64)) // if n < 64: JB(LabelRef("cmp32")) // goto cmp32 cmpAVX(S256, 2, true) // ZF = [compare 64 bytes] JNE(LabelRef("done")) // return if ZF == 0 Label("cmp32") CMPQ(n, U8(32)) // if n < 32: JB(LabelRef("cmp16")) // goto cmp16 cmpAVX(S256, 1, true) // ZF = [compare 32 bytes] JNE(LabelRef("done")) // return if ZF == 0 Label("cmp16") // Convert YMM masks to XMM bit = bit.(Vec).AsX() msk = msk.(Vec).AsX() rng = rng.(Vec).AsX() one = one.(Vec).AsX() CMPQ(n, U8(16)) // if n <= 16: JLE(LabelRef("cmp_tail")) // goto cmp_tail cmpAVX(S128, 1, true) // ZF = [compare 16 bytes] JNE(LabelRef("done")) // return if ZF == 0 Label("cmp_tail") // At this point, we have <= 16 bytes to compare, but we know the total input // is >= 16 bytes. Move the pointer to the *last* 16 bytes of the input so we // can skip the fallback. SUBQ(Imm(16), n) // n -= 16 ADDQ(n, a.Index) // i += n cmpAVX(S128, 1, false) // ZF = [compare 16 bytes] JMP(LabelRef("done")) // return ZF Generate() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/ascii/valid_asm.go000066400000000000000000000106741452252572700263150ustar00rootroot00000000000000//go:build ignore // +build ignore package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/x86" "github.com/segmentio/asm/cpu" ) func init() { ConstraintExpr("!purego") } func main() { TEXT("ValidString", NOSPLIT, "func(s string) bool") Doc("ValidString returns true if s contains only ASCII characters.") p := Mem{Base: Load(Param("s").Base(), GP64())} n := Load(Param("s").Len(), GP64()) ret, _ := ReturnIndex(0).Resolve() v := GP32() vl := GP32() maskG := GP64() MOVQ(U64(0x8080808080808080), maskG) // maskG = 0x8080808080808080 CMPQ(n, U8(16)) // if n < 16: JB(LabelRef("cmp8")) // goto cmp8 JumpIfFeature("init_avx", cpu.AVX2) Label("cmp8") CMPQ(n, U8(8)) // if n < 8: JB(LabelRef("cmp4")) // goto cmp4 TESTQ(maskG, p) // if (p[0:8] & 0x8080808080808080) != 0: JNZ(LabelRef("invalid")) // return false ADDQ(U8(8), p.Base) // p += 8 SUBQ(U8(8), n) // n -= 8 JMP(LabelRef("cmp8")) // loop cmp8 Label("cmp4") CMPQ(n, U8(4)) // if n < 4: JB(LabelRef("cmp3")) // goto cmp3 TESTL(U32(0x80808080), p) // if (p[0:4] & 0x80808080) != 0: JNZ(LabelRef("invalid")) // return false ADDQ(U8(4), p.Base) // p += 4 SUBQ(U8(4), n) // n -= 4 Label("cmp3") CMPQ(n, U8(3)) // if n < 3: JB(LabelRef("cmp2")) // goto cmp2 MOVWLZX(p, vl) // vl = p[i:i+2] MOVBLZX(p.Offset(2), v) // v = p[i+2:i+3] SHLL(U8(16), v) // v <<= 16 ORL(vl, v) // v = vl | v TESTL(U32(0x80808080), v) // ZF = (v & 0x80808080) == 0 JMP(LabelRef("done")) // return ZF Label("cmp2") CMPQ(n, U8(2)) // if n < 2: JB(LabelRef("cmp1")) // goto cmp1 TESTW(U16(0x8080), p) // ZF = (p[0:2] & 0x8080) == 0 JMP(LabelRef("done")) // return ZF Label("cmp1") CMPQ(n, U8(0)) // if n == 0: JE(LabelRef("done")) // return true TESTB(U8(0x80), p) // ZF = (p[0:1] & 0x80) == 0 Label("done") SETEQ(ret.Addr) // return ZF RET() // ... 
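	// A byte with the high bit set was found: return false.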
Label("invalid") MOVB(U8(0), ret.Addr) RET() Label("init_avx") maskY := VecBroadcast(maskG, YMM()) maskX := maskY.(Vec).AsX() vec := NewVectorizer(15, func(l VectorLane) Register { r := l.Alloc() VMOVDQU(l.Offset(p), r) VPOR(l.Offset(p), r, r) return r }).Reduce(ReduceOr) Label("cmp256") CMPQ(n, U32(256)) // if n < 256: JB(LabelRef("cmp128")) // goto cmp128 VPTEST(vec.Compile(S256, 4)[0], maskY) // if (OR & maskY) != 0: JNZ(LabelRef("invalid")) // return false ADDQ(U32(256), p.Base) // p += 256 SUBQ(U32(256), n) // n -= 256 JMP(LabelRef("cmp256")) // loop cmp256 Label("cmp128") CMPQ(n, U8(128)) // if n < 128: JB(LabelRef("cmp64")) // goto cmp64 VPTEST(vec.Compile(S256, 2)[0], maskY) // if (OR & maskY) != 0: JNZ(LabelRef("invalid")) // return false ADDQ(U8(128), p.Base) // p += 128 SUBQ(U8(128), n) // n -= 128 JMP(LabelRef("cmp64")) // goto cmp64 Label("cmp64") CMPQ(n, U8(64)) // if n < 64: JB(LabelRef("cmp32")) // goto cmp32 VPTEST(vec.Compile(S256, 1)[0], maskY) // if (OR & maskY) != 0: JNZ(LabelRef("invalid")) // return false ADDQ(U8(64), p.Base) // p += 64 SUBQ(U8(64), n) // n -= 64 Label("cmp32") CMPQ(n, U8(32)) // if n < 32: JB(LabelRef("cmp16")) // goto cmp16 VPTEST(p, maskY) // if (p[0:32] & maskY) != 0: JNZ(LabelRef("invalid")) // return false ADDQ(U8(32), p.Base) // p += 32 SUBQ(U8(32), n) // n -= 32 Label("cmp16") CMPQ(n, U8(16)) // if n <= 16: JLE(LabelRef("cmp_tail")) // goto cmp_tail VPTEST(p, maskX) // if (p[0:16] & maskX) != 0: JNZ(LabelRef("invalid")) // return false ADDQ(U8(16), p.Base) // p += 16 SUBQ(U8(16), n) // n -= 16 Label("cmp_tail") // At this point, we have <= 16 bytes to compare, but we know the total input // is >= 16 bytes. Move the pointer to the *last* 16 bytes of the input so we // can skip the fallback. SUBQ(Imm(16), n) // n -= 16 ADDQ(n, p.Base) // p += n VPTEST(p, maskX) // ZF = (p[0:16] & maskX) == 0 JMP(LabelRef("done")) // return ZF Generate() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/ascii/valid_print_asm.go000066400000000000000000000142121452252572700275210ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . 
"github.com/segmentio/asm/build/internal/x86" "github.com/segmentio/asm/cpu" ) func init() { ConstraintExpr("!purego") } func main() { TEXT("ValidPrintString", NOSPLIT, "func(s string) bool") Doc("ValidPrintString returns true if s contains only printable ASCII characters.") p := Mem{Base: Load(Param("s").Base(), GP64())} n := Load(Param("s").Len(), GP64()) ret, _ := ReturnIndex(0).Resolve() m1 := GP64() m2 := GP64() m3 := GP64() val := GP32() tmp := GP32() CMPQ(n, U8(16)) // if n < 16: JB(LabelRef("init_x86")) // goto init_x86 JumpIfFeature("init_avx", cpu.AVX2) // goto init_avx if supported Label("init_x86") CMPQ(n, U8(8)) // if n < 8: JB(LabelRef("cmp4")) // goto cmp4 MOVQ(U64(0xDFDFDFDFDFDFDFE0), m1) MOVQ(U64(0x0101010101010101), m2) MOVQ(U64(0x8080808080808080), m3) Label("cmp8") valid8(p, n, m1, m2, m3) // ZF = [compare 8 bytes] JNE(LabelRef("done")) // return ZF if ZF == 0 CMPQ(n, U8(8)) // if n < 8: JB(LabelRef("cmp4")) // goto cmp4 JMP(LabelRef("cmp8")) // loop cmp8 Label("cmp4") CMPQ(n, U8(4)) // if n < 4: JB(LabelRef("cmp3")) // goto cmp3 valid4(p, n) // ZF = [compare 4 bytes] JNE(LabelRef("done")) // return ZF if ZF == 0 Label("cmp3") CMPQ(n, U8(3)) // if n < 3: JB(LabelRef("cmp2")) // goto cmp2 MOVWLZX(p, tmp) // tmp = p[0:2] MOVBLZX(p.Offset(2), val) // val = p[2:3] SHLL(U8(16), val) // val <<= 16 ORL(tmp, val) // val = tmp | val ORL(U32(0x20000000), val) // val = 0x20000000 | val JMP(LabelRef("final")) Label("cmp2") CMPQ(n, U8(2)) // if n < 2: JB(LabelRef("cmp1")) // goto cmp1 MOVWLZX(p, val) // val = p[0:2] ORL(U32(0x20200000), val) // val = 0x20200000 | val JMP(LabelRef("final")) Label("cmp1") CMPQ(n, U8(0)) // if n == 0: JE(LabelRef("done")) // return true MOVBLZX(p, val) // val = p[0:1] ORL(U32(0x20202000), val) // val = 0x20202000 | val Label("final") setup4(val) // [update val register] TESTL(U32(0x80808080), val) // ZF = (0x80808080 & val) == 0 Label("done") SETEQ(ret.Addr) // return ZF RET() // ... Label("init_avx") min := VecBroadcast(U8(0x1F), YMM()) max := VecBroadcast(U8(0x7E), YMM()) vec := NewVectorizer(14, func(l VectorLane) Register { v0 := l.Read(p) v1 := l.Alloc() VPCMPGTB(min, v0, v1) // v1 = bytes that are greater than the min-1 (i.e. valid at lower end) VPCMPGTB(max, v0, v0) // v0 = bytes that are greater than the max (i.e. invalid at upper end) VPANDN(v1, v0, v0) // y2 & ~y3 mask should be full unless there's an invalid byte return v0 }).Reduce(ReduceAnd) // merge all comparisons together cmpAVX := func(spec Spec, lanes int, incr bool) { sz := int(spec.Size()) out := vec.Compile(spec, lanes)[0] // [compare sz*lanes bytes] if incr { ADDQ(U8(sz*lanes), p.Base) // p += sz*lanes SUBQ(U8(sz*lanes), n) // n -= sz*lanes } VPMOVMSKB(out, tmp) // tmp[0,1,2,...] = y0[0,8,16,...] 
XORL(U32(^uint32(0)>>(32-sz)), tmp) // ZF = (tmp == 0xFFFFFFFF) } Label("cmp128") CMPQ(n, U8(128)) // if n < 128: JB(LabelRef("cmp64")) // goto cmp64 cmpAVX(S256, 4, true) // ZF = [compare 128 bytes] JNE(LabelRef("done")) // return if ZF == 0 JMP(LabelRef("cmp128")) // loop cmp128 Label("cmp64") CMPQ(n, U8(64)) // if n < 64: JB(LabelRef("cmp32")) // goto cmp32 cmpAVX(S256, 2, true) // ZF = [compare 64 bytes] JNE(LabelRef("done")) // return if ZF == 0 Label("cmp32") CMPQ(n, U8(32)) // if n < 32: JB(LabelRef("cmp16")) // goto cmp16 cmpAVX(S256, 1, true) // ZF = [compare 32 bytes] JNE(LabelRef("done")) // return if ZF == 0 Label("cmp16") // Convert YMM masks to XMM min = min.(Vec).AsX() max = max.(Vec).AsX() CMPQ(n, U8(16)) // if n <= 16: JLE(LabelRef("cmp_tail")) // goto cmp_tail cmpAVX(S128, 1, true) // ZF = [compare 16 bytes] JNE(LabelRef("done")) // return if ZF == 0 Label("cmp_tail") // At this point, we have <= 16 bytes to compare, but we know the total input // is >= 16 bytes. Move the pointer to the *last* 16 bytes of the input so we // can skip the fallback. SUBQ(Imm(16), n) // n -= 16 ADDQ(n, p.Base) // p += n cmpAVX(S128, 1, false) // ZF = [compare 16 bytes] JMP(LabelRef("done")) // return ZF Generate() } func valid4(p Mem, n Register) { val := GP32() MOVL(p, val) // val = p[0:4] setup4(val) // [update val register] ADDQ(U8(4), p.Base) // p += 4 SUBQ(U8(4), n) // n -= 4 TESTL(U32(0x80808080), val) // ZF = (0x80808080 & val) == 0 } func setup4(val Register) { nval := GP32() tmp1 := GP32() tmp2 := GP32() MOVL(val, nval) // nval = val LEAL(Mem{Disp: 0xDFDFDFE0, Base: val}, tmp1) // tmp1 = val + 0xDFDFDFE0 NOTL(nval) // nval = ^nval ANDL(nval, tmp1) // tmp1 = nval & tmp1 LEAL(Mem{Disp: 0x01010101, Base: val}, tmp2) // tmp2 = val + 0x01010101 ORL(tmp2, val) // val = val | tmp2 ORL(tmp1, val) // val = val | tmp1 } func valid8(p Mem, n, m1, m2, m3 Register) { val := GP64() nval := GP64() tmp1 := GP64() tmp2 := GP64() MOVQ(p, val) // val = p[0:8] MOVQ(val, nval) // nval = val LEAQ(Mem{Base: val, Index: m1, Scale: 1}, tmp1) // tmp1 = val + m1 NOTQ(nval) // nval = ^nval ANDQ(nval, tmp1) // tmp1 = nval & tmp1 LEAQ(Mem{Base: val, Index: m2, Scale: 1}, tmp2) // tmp2 = val + m2 ORQ(tmp2, val) // val = val | tmp2 ORQ(tmp1, val) // val = val | tmp1 ADDQ(U8(8), p.Base) // p += 8 SUBQ(U8(8), n) // n -= 8 TESTQ(m3, val) // ZF = (m3 & val) == 0 } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/base64/000077500000000000000000000000001452252572700240135ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/base64/decode_asm.go000066400000000000000000000065411452252572700264330ustar00rootroot00000000000000// +build ignore // // This code is a go assembly implementation of: // // Muła, Wojciech, & Lemire, Daniel (Thu, 14 Jun 2018). // Faster Base64 Encoding and Decoding Using AVX2 Instructions. // [arXiv:1704.00605](https://arxiv.org/abs/1704.00605) // // ...with changes to support multiple encodings. package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/gotypes" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/asm" . 
"github.com/segmentio/asm/build/internal/x86" ) var lutHi = ConstBytes("b64_dec_lut_hi", []byte{ 16, 16, 1, 2, 4, 8, 4, 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1, 2, 4, 8, 4, 8, 16, 16, 16, 16, 16, 16, 16, 16, }) var madd1 = ConstBytes("b64_dec_madd1", []byte{ 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, }) var madd2 = ConstArray16("b64_dec_madd2", 4096, 1, 4096, 1, 4096, 1, 4096, 1, 4096, 1, 4096, 1, 4096, 1, 4096, 1, ) var shufLo = ConstBytes("b64_dec_shuf_lo", []byte{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 6, }) var shuf = ConstBytes("b64_dec_shuf", []byte{ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, }) func init() { ConstraintExpr("!purego") } func main() { TEXT("decodeAVX2", NOSPLIT, "func(dst, src []byte, lut *int8) (int, int)") createDecode(Param("dst"), Param("src"), Param("lut"), func(m Mem, r VecVirtual) { VMOVDQU(m, r) }) TEXT("decodeAVX2URI", NOSPLIT, "func(dst, src []byte, lut *int8) (int, int)") slash := VecBroadcast(U8('/'), YMM()) underscore := VecBroadcast(U8('_'), YMM()) createDecode(Param("dst"), Param("src"), Param("lut"), func(m Mem, r VecVirtual) { eq := YMM() VMOVDQU(m, r) VPCMPEQB(r, underscore, eq) VPBLENDVB(eq, slash, r, r) }) Generate() } func createDecode(pdst, psrc, plut Component, load func(m Mem, r VecVirtual)) { dst := Mem{Base: Load(pdst.Base(), GP64()), Index: GP64(), Scale: 1} src := Mem{Base: Load(psrc.Base(), GP64()), Index: GP64(), Scale: 1} lut := Mem{Base: Load(plut, GP64())} rem := Load(psrc.Len(), GP64()) rsrc := YMM() rdst := YMM() nibh := YMM() nibl := YMM() emsk := YMM() roll := YMM() shfl := YMM() lutl := YMM() luth := YMM() lutr := YMM() zero := YMM() lo := YMM() hi := YMM() mask := VecBroadcast(U8(0x2f), YMM()) XORQ(dst.Index, dst.Index) XORQ(src.Index, src.Index) VPXOR(zero, zero, zero) VPERMQ(Imm(1<<6|1<<2), lut, lutr) VPERMQ(Imm(1<<6|1<<2), lut.Offset(16), lutl) VMOVDQA(lutHi, luth) Label("loop") load(src, rsrc) VPSRLD(Imm(4), rsrc, nibh) VPAND(mask, rsrc, nibl) VPSHUFB(nibl, lutl, lo) VPAND(mask, nibh, nibh) VPSHUFB(nibh, luth, hi) VPTEST(hi, lo) JNE(LabelRef("done")) VPCMPEQB(mask, rsrc, emsk) VPADDB(emsk, nibh, roll) VPSHUFB(roll, lutr, roll) VPADDB(rsrc, roll, shfl) VPMADDUBSW(madd1, shfl, shfl) VPMADDWD(madd2, shfl, shfl) VEXTRACTI128(Imm(1), shfl, rdst.AsX()) VPSHUFB(shufLo, rdst.AsX(), rdst.AsX()) VPSHUFB(shuf, shfl, shfl) VPBLENDD(Imm(8), rdst, shfl, rdst) VPBLENDD(Imm(192), zero, rdst, rdst) VMOVDQU(rdst, dst) ADDQ(Imm(24), dst.Index) ADDQ(Imm(32), src.Index) SUBQ(Imm(32), rem) CMPQ(rem, Imm(45)) JB(LabelRef("done")) JMP(LabelRef("loop")) Label("done") Store(dst.Index, ReturnIndex(0)) Store(src.Index, ReturnIndex(1)) VZEROUPPER() RET() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/base64/encode_asm.go000066400000000000000000000053641452252572700264470ustar00rootroot00000000000000// +build ignore // // This code is a go assembly implementation of: // // Muła, Wojciech, & Lemire, Daniel (Thu, 14 Jun 2018). // Faster Base64 Encoding and Decoding Using AVX2 Instructions. // [arXiv:1704.00605](https://arxiv.org/abs/1704.00605) // // ...with changes to support multiple encodings. package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/segmentio/asm/build/internal/asm" . 
"github.com/segmentio/asm/build/internal/x86" ) func init() { ConstraintExpr("!purego") } func main() { TEXT("encodeAVX2", NOSPLIT, "func(dst, src []byte, lut *int8) (int, int)") dst := Mem{Base: Load(Param("dst").Base(), GP64()), Index: GP64(), Scale: 1} src := Mem{Base: Load(Param("src").Base(), GP64()), Index: GP64(), Scale: 1} lut := Mem{Base: Load(Param("lut"), GP64())} rem := Load(Param("src").Len(), GP64()) rsrc := YMM() rdst := YMM() msrc := YMM() shl4 := YMM() shl8 := YMM() blnd := YMM() mult := YMM() shfl := YMM() subs := YMM() cmps := YMM() xlat := YMM() xtab := YMM() xsub := VecBroadcast(U8(51), YMM()) xcmp := VecBroadcast(U8(25), YMM()) XORQ(dst.Index, dst.Index) XORQ(src.Index, src.Index) Comment("Load the 16-byte LUT into both lanes of the register") VPERMQ(Imm(1<<6|1<<2), lut, xtab) Comment("Load the first block using a mask to avoid potential fault") VMOVDQU(ConstLoadMask32("b64_enc_load", 0, 1, 1, 1, 1, 1, 1, 1, ), rsrc) VPMASKMOVD(src.Offset(-4), rsrc, rsrc) Label("loop") VPSHUFB(ConstBytes("b64_enc_shuf", []byte{ 5, 4, 6, 5, 8, 7, 9, 8, 11, 10, 12, 11, 14, 13, 15, 14, 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10, }), rsrc, rsrc) VPAND(ConstArray16("b64_enc_mask1", 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, ), rsrc, msrc) VPSLLW(Imm(8), msrc, shl8) VPSLLW(Imm(4), msrc, shl4) VPBLENDW(Imm(170), shl8, shl4, blnd) VPAND(ConstArray16("b64_enc_mask2", 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, ), rsrc, msrc) VPMULHUW(ConstArray16("b64_enc_mult", 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, ), msrc, mult) VPOR(mult, blnd, shfl) VPSUBUSB(xsub, shfl, subs) VPCMPGTB(xcmp, shfl, cmps) VPSUBB(cmps, subs, subs) VPSHUFB(subs, xtab, xlat) VPADDB(shfl, xlat, rdst) VMOVDQU(rdst, dst) ADDQ(Imm(32), dst.Index) ADDQ(Imm(24), src.Index) SUBQ(Imm(24), rem) CMPQ(rem, Imm(32)) JB(LabelRef("done")) VMOVDQU(src.Offset(-4), rsrc) JMP(LabelRef("loop")) Label("done") Store(dst.Index, ReturnIndex(0)) Store(src.Index, ReturnIndex(1)) VZEROUPPER() RET() Generate() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/bswap/000077500000000000000000000000001452252572700240435ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/bswap/swap64_asm.go000066400000000000000000000050031452252572700263540ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/segmentio/asm/build/internal/asm" . "github.com/segmentio/asm/build/internal/x86" "github.com/mmcloughlin/avo/reg" "github.com/segmentio/asm/cpu" ) const unroll = 4 func init() { ConstraintExpr("!purego") } func main() { TEXT("swap64", NOSPLIT, "func(b []byte)") Doc("swap64 performs an in-place byte swap on each qword of the input buffer.") // Load slice ptr + length, and calculate end ptr. ptr := Load(Param("b").Base(), GP64()) len := Load(Param("b").Len(), GP64()) end := GP64() MOVQ(ptr, end) ADDQ(len, end) JumpUnlessFeature("x86_loop", cpu.AVX2) // Prepare the shuffle mask. shuffleMaskData := ConstBytes("shuffle_mask", []byte{ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, }) shuffleMask := YMM() VMOVDQU(shuffleMaskData, shuffleMask) // Loop while we have at least unroll*32 bytes remaining. 
Label("avx2_loop") next := GP64() MOVQ(ptr, next) ADDQ(Imm(unroll*32), next) CMPQ(next, end) JAE(LabelRef("x86_loop")) // Load multiple chunks => byte swap => store. var vectors [unroll]reg.VecVirtual for i := 0; i < unroll; i++ { vectors[i] = YMM() } for i := 0; i < unroll; i++ { VMOVDQU(Mem{Base: ptr}.Offset(i*32), vectors[i]) } for i := 0; i < unroll; i++ { VPSHUFB(shuffleMask, vectors[i], vectors[i]) } for i := 0; i < unroll; i++ { VMOVDQU(vectors[i], Mem{Base: ptr}.Offset(i*32)) } // Increment ptr and loop. MOVQ(next, ptr) JMP(LabelRef("avx2_loop")) // Loop while we have at least unroll*8 bytes remaining. Label("x86_loop") next = GP64() MOVQ(ptr, next) ADDQ(Imm(unroll*8), next) CMPQ(next, end) JAE(LabelRef("slow_loop")) // Load qwords => byte swap => store. var chunks [unroll]reg.GPVirtual for i := 0; i < unroll; i++ { chunks[i] = GP64() } for i := 0; i < unroll; i++ { MOVQ(Mem{Base: ptr}.Offset(i*8), chunks[i]) } for i := 0; i < unroll; i++ { BSWAPQ(chunks[i]) } for i := 0; i < unroll; i++ { MOVQ(chunks[i], Mem{Base: ptr}.Offset(i*8)) } // Increment ptr and loop. MOVQ(next, ptr) JMP(LabelRef("x86_loop")) // Loop until ptr reaches the end. Label("slow_loop") CMPQ(ptr, end) JAE(LabelRef("done")) // Load a qword => byte swap => store. qword := GP64() MOVQ(Mem{Base: ptr}, qword) BSWAPQ(qword) MOVQ(qword, Mem{Base: ptr}) // Increment ptr and loop. ADDQ(Imm(8), ptr) JMP(LabelRef("slow_loop")) Label("done") RET() Generate() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/go.mod000066400000000000000000000003151452252572700240340ustar00rootroot00000000000000module github.com/segmentio/asm/build go 1.16 require ( github.com/mmcloughlin/avo v0.4.0 github.com/segmentio/asm v0.0.3 golang.org/x/sys v0.1.0 // indirect golang.org/x/tools v0.1.12 // indirect ) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/go.sum000066400000000000000000000112751452252572700240700ustar00rootroot00000000000000github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI= github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU= github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s= github.com/segmentio/asm v0.0.3 h1:ciVBxfM3cIEuGR1VDXPCxdz+Qo+wxlrrt7AkycMJcts= github.com/segmentio/asm v0.0.3/go.mod h1:4EUJGaKsB8ImLUwOGORVsNd9vTRDeh44JGsY4aKp5I4= github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 h1:6zppjxzCulZykYSLyVDYbneBfbaBIQPYMevg0bEwv2s= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo= golang.org/x/tools v0.1.12 h1:VveCTK38A2rkS8ZqFY25HIDFscX5X9OoEhJd3quQmXU= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= 
golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/internal/000077500000000000000000000000001452252572700245435ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/internal/asm/000077500000000000000000000000001452252572700253235ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/internal/asm/asm.go000066400000000000000000000045441452252572700264410ustar00rootroot00000000000000package asm import ( "encoding/binary" . "github.com/mmcloughlin/avo/build" "github.com/mmcloughlin/avo/operand" ) func ConstBytes(name string, data []byte) operand.Mem { m := GLOBL(name, RODATA|NOPTR) switch { case len(data)%8 == 0: constBytes8(0, data) case len(data)%4 == 0: constBytes4(0, data) default: i := (len(data) / 8) * 8 constBytes8(0, data[:i]) constBytes1(i, data[i:]) } return m } func ConstArray16(name string, elems ...uint16) operand.Mem { data := make([]byte, 2*len(elems)) for i, elem := range elems { binary.LittleEndian.PutUint16(data[i*2:], elem) } return ConstBytes(name, data) } func ConstArray32(name string, elems ...uint32) operand.Mem { data := make([]byte, 4*len(elems)) for i, elem := range elems { binary.LittleEndian.PutUint32(data[i*4:], elem) } return ConstBytes(name, data) } func ConstArray64(name string, elems ...uint64) operand.Mem { data := make([]byte, 8*len(elems)) for i, elem := range elems { binary.LittleEndian.PutUint64(data[i*8:], elem) } return ConstBytes(name, data) } func ConstShuffleMask32(name string, indices ...uint32) operand.Mem { data := make([]byte, 4*len(indices)) for i, index := range indices { for j := 0; j < 4; j++ { data[i*4+j] = byte(index*4 + uint32(j)) } } return ConstBytes(name, data) } func ConstShuffleMask64(name string, indices ...uint64) operand.Mem { data := make([]byte, 8*len(indices)) for i, index := range indices { for j := 0; j < 8; j++ { data[i*8+j] = byte(index*8 + uint64(j)) } } return ConstBytes(name, data) } func ConstLoadMask32(name string, indices ...uint32) operand.Mem { data := make([]uint32, len(indices)) for i, index := range indices { data[i] = index << 31 } return ConstArray32(name, data...) } func ConstLoadMask64(name string, indices ...uint64) operand.Mem { data := make([]uint64, len(indices)) for i, index := range indices { data[i] = index << 63 } return ConstArray64(name, data...) } func constBytes8(offset int, data []byte) { for i := 0; i < len(data); i += 8 { DATA(offset+i, operand.U64(binary.LittleEndian.Uint64(data[i:i+8]))) } } func constBytes4(offset int, data []byte) { for i := 0; i < len(data); i += 4 { DATA(offset+i, operand.U32(binary.LittleEndian.Uint32(data[i:i+4]))) } } func constBytes1(offset int, data []byte) { for i, b := range data { DATA(offset+i, operand.U8(b)) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/internal/x86/000077500000000000000000000000001452252572700251705ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/internal/x86/bytes.go000066400000000000000000000135011452252572700266450ustar00rootroot00000000000000package x86 import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . 
"github.com/mmcloughlin/avo/reg" "github.com/segmentio/asm/cpu" ) type Memory struct { Size int Index Register Offset int } func (m Memory) Resolve(base Register) Mem { memory := Mem{Base: base, Disp: m.Offset, Scale: 1} if m.Index != nil { memory.Index = m.Index } return memory } func (m Memory) Load(base Register) Register { r := GetRegister(m.Size) m.mov(m.Resolve(base), r) return r } func (m Memory) Store(src Register, base Register) { m.mov(src, m.Resolve(base)) } func (m Memory) mov(src, dst Op) { switch m.Size { case 1: MOVB(src, dst) case 2: MOVW(src, dst) case 4: MOVL(src, dst) case 8: MOVQ(src, dst) case 16: MOVOU(src, dst) case 32: VMOVDQU(src, dst) } } func GetRegister(size int) (r Register) { switch size { case 1: r = GP8() case 2: r = GP16() case 4: r = GP32() case 8: r = GP64() case 16: r = XMM() case 32: r = YMM() default: panic("bad register size") } return } func BinaryOpTable(B, W, L, Q, X func(Op, Op), VEX func(Op, Op, Op)) []func(Op, Op) { return []func(Op, Op){ 1: B, 2: W, 4: L, 8: Q, 16: X, 32: func(src, dst Op) { VEX(src, dst, dst) }, } } func GenerateCopy(name, doc string, transform []func(Op, Op)) { TEXT(name, NOSPLIT, "func(dst, src []byte) int") Doc(name + " " + doc) dst := Load(Param("dst").Base(), GP64()) src := Load(Param("src").Base(), GP64()) n := Load(Param("dst").Len(), GP64()) x := Load(Param("src").Len(), GP64()) CMPQ(x, n) CMOVQLT(x, n) Store(n, ReturnIndex(0)) VariableLengthBytes{ Unroll: 128, Process: func(regs []Register, memory ...Memory) { src, dst := regs[0], regs[1] count := len(memory) operands := make([]Op, count*2) for i, m := range memory { operands[i] = m.Load(src) if transform != nil { if m.Size == 32 { // For AVX2, avoid loading the destination into a register // before transforming it; pass the memory argument directly // to the transform instruction. 
operands[i+count] = m.Resolve(dst) } else { operands[i+count] = m.Load(dst) } } } if transform != nil { for i, m := range memory { transform[m.Size](operands[i+count], operands[i]) } } for i, m := range memory { m.Store(operands[i].(Register), dst) } }, }.Generate([]Register{src, dst}, n) } type VariableLengthBytes struct { SetupXMM func() SetupYMM func() Process func(inputs []Register, memory ...Memory) Unroll int } func (v VariableLengthBytes) Generate(inputs []Register, n Register) { unroll := uint64(v.Unroll) if unroll != 128 && unroll != 256 { panic("unsupported unroll") } Label("start") if v.SetupXMM != nil { CMPQ(n, Imm(16)) JBE(LabelRef("tail")) v.SetupXMM() } Label("tail") CMPQ(n, Imm(0)) JE(LabelRef("done")) CMPQ(n, Imm(1)) JE(LabelRef("handle1")) CMPQ(n, Imm(3)) JBE(LabelRef("handle2to3")) CMPQ(n, Imm(4)) JE(LabelRef("handle4")) CMPQ(n, Imm(8)) JB(LabelRef("handle5to7")) JE(LabelRef("handle8")) CMPQ(n, Imm(16)) JBE(LabelRef("handle9to16")) CMPQ(n, Imm(32)) JBE(LabelRef("handle17to32")) CMPQ(n, Imm(64)) JBE(LabelRef("handle33to64")) JumpUnlessFeature("generic", cpu.AVX2) if v.SetupYMM != nil { v.SetupYMM() } CMPQ(n, U32(unroll)) JB(LabelRef("avx2_tail")) JMP(LabelRef("avx2")) Label("generic") v.Process(inputs, Memory{Size: 16}, Memory{Size: 16, Offset: 16}, Memory{Size: 16, Offset: 32}, Memory{Size: 16, Offset: 48}, ) for i := range inputs { ADDQ(Imm(64), inputs[i]) } SUBQ(Imm(64), n) CMPQ(n, Imm(64)) JBE(LabelRef("tail")) JMP(LabelRef("generic")) Label("done") RET() Label("handle1") v.Process(inputs, Memory{Size: 1}) RET() Label("handle2to3") v.Process(inputs, Memory{Size: 2}, Memory{Size: 2, Index: n, Offset: -2}) RET() Label("handle4") v.Process(inputs, Memory{Size: 4}) RET() Label("handle5to7") v.Process(inputs, Memory{Size: 4}, Memory{Size: 4, Index: n, Offset: -4}) RET() Label("handle8") v.Process(inputs, Memory{Size: 8}) RET() Label("handle9to16") v.Process(inputs, Memory{Size: 8}, Memory{Size: 8, Index: n, Offset: -8}) RET() Label("handle17to32") v.Process(inputs, Memory{Size: 16}, Memory{Size: 16, Index: n, Offset: -16}) RET() Label("handle33to64") v.Process(inputs, Memory{Size: 16}, Memory{Size: 16, Offset: 16}, Memory{Size: 16, Index: n, Offset: -32}, Memory{Size: 16, Index: n, Offset: -16}) RET() // We have at least `unroll` bytes. Comment("AVX optimized version for medium to large size inputs.") Label("avx2") var memory []Memory for i := 0; i < int(unroll / 32); i++ { memory = append(memory, Memory{Size: 32, Offset: i * 32}) } v.Process(inputs, memory...) for i := range inputs { ADDQ(U32(unroll), inputs[i]) } SUBQ(U32(unroll), n) JZ(LabelRef("avx2_done")) CMPQ(n, U32(unroll)) JAE(LabelRef("avx2")) // We have between [1, unroll) bytes. 
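	// The tail paths below address memory relative to the end of the buffer
	// (Index: n with negative offsets), so the final blocks may overlap bytes
	// that were already processed instead of falling back to a scalar loop.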
Label("avx2_tail") CMPQ(n, Imm(64)) JBE(LabelRef("avx2_tail_1to64")) if unroll == 256 { CMPQ(n, Imm(128)) JBE(LabelRef("avx2_tail_65to128")) Label("avx2_tail_129to256") v.Process(inputs, Memory{Size: 32}, Memory{Size: 32, Offset: 32}, Memory{Size: 32, Offset: 64}, Memory{Size: 32, Offset: 96}, Memory{Size: 32, Index: n, Offset: -128}, Memory{Size: 32, Index: n, Offset: -96}, Memory{Size: 32, Index: n, Offset: -64}, Memory{Size: 32, Index: n, Offset: -32}) JMP(LabelRef("avx2_done")) } Label("avx2_tail_65to128") v.Process(inputs, Memory{Size: 32}, Memory{Size: 32, Offset: 32}, Memory{Size: 32, Index: n, Offset: -64}, Memory{Size: 32, Index: n, Offset: -32}) JMP(LabelRef("avx2_done")) Label("avx2_tail_1to64") v.Process(inputs, Memory{Size: 32, Index: n, Offset: -64}, Memory{Size: 32, Index: n, Offset: -32}) Label("avx2_done") VZEROUPPER() RET() Generate() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/internal/x86/cpu.go000066400000000000000000000030611452252572700263060ustar00rootroot00000000000000package x86 import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" "math" "math/bits" "github.com/segmentio/asm/cpu" ) // JumpIfFeature constructs a jump sequence that tests for one or more feature flags. // If all flags are matched, jump to the target label. func JumpIfFeature(jmp string, f cpu.X86Feature) { jump(LabelRef(jmp), f, false) } // JumpUnlessFeature constructs a jump sequence that tests for one or more feature flags. // Unless all flags are matched, jump to the target label. func JumpUnlessFeature(jmp string, f cpu.X86Feature) { jump(LabelRef(jmp), f, true) } // cpuAddr is a Mem operand containing the global symbolic reference to the // X86 cpu feature flags. var cpuAddr = NewDataAddr(Symbol{Name: "github·com∕segmentio∕asm∕cpu·X86"}, 0) func jump(jmp Op, f cpu.X86Feature, invert bool) { if bits.OnesCount64(uint64(f)) == 1 { // If the feature test is for a single flag, optimize the test using BTQ jumpSingleFlag(jmp, f, invert) } else { jumpMultiFlag(jmp, f, invert) } } func jumpSingleFlag(jmp Op, f cpu.X86Feature, invert bool) { bit := U8(bits.TrailingZeros64(uint64(f))) // Likely only need lower 4 bytes if bit < 32 { BTL(bit, cpuAddr) } else { BTQ(bit, cpuAddr) } if invert { JCC(jmp) } else { JCS(jmp) } } func jumpMultiFlag(jmp Op, f cpu.X86Feature, invert bool) { r := GP64() MOVQ(cpuAddr, r) var op Op if f <= math.MaxUint32 { op = U32(f) } else { op = GP64() MOVQ(U64(f), op) } ANDQ(op, r) CMPQ(r, op) if invert { JNE(jmp) } else { JEQ(jmp) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/internal/x86/reg.go000066400000000000000000000174731452252572700263100ustar00rootroot00000000000000package x86 import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . 
"github.com/mmcloughlin/avo/reg" ) var all = map[Spec][]VecPhysical{ S128: []VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, X29, X30, X31}, S256: []VecPhysical{Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16, Y17, Y18, Y19, Y20, Y21, Y22, Y23, Y24, Y25, Y26, Y27, Y28, Y29, Y30, Y31}, S512: []VecPhysical{Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z8, Z9, Z10, Z11, Z12, Z13, Z14, Z15, Z16, Z17, Z18, Z19, Z20, Z21, Z22, Z23, Z24, Z25, Z26, Z27, Z28, Z29, Z30, Z31}, } func gp(c func() GPVirtual, g ...Register) Register { if len(g) == 0 { return c() } return g[0] } // load either emits a size-specific MOV operation based on the input immediate // value, or returns the existing register. The optional destination register // must be a general purpose register if provided. If not provided, a virtual // register will be allocated if needed. func load(ir Op, dst ...Register) (Register, uint) { switch v := ir.(type) { default: panic("unsupported input operand") case U8, I8: g := gp(GP32, dst...) MOVB(v, g.(GP).As8()) return g, 1 case U16, I16: g := gp(GP32, dst...) MOVW(v, g.(GP).As16()) return g, 2 case U32, I32: g := gp(GP32, dst...) MOVL(v, g) return g, 4 case U64, I64: g := gp(GP64, dst...) MOVQ(v, g) return g, 8 case Register: return v, v.Size() } } // VecList returns a slice of vector registers for the given Spec. func VecList(s Spec, max int) []VecPhysical { return all[s][:max] } // VecBroadcast broadcasts an immediate or general purpose register into a // vector register. The broadcast size is based on the input operand size. // If the input is a register, it may be necessary to convert it to the // desired size. For example: // reg := GP32() // XORL(reg, reg) // MOVB(U8(0x7F), reg.As8()) // mask := VecBroadcast(reg, XMM()) // will broadcast 0x0000007F0000007F0000007F0000007F // mask := VecBroadcast(reg.As8(), XMM()) // will broadcast 0x7F7F7F7F7F7F7F7F7F7F7F7F7F7F7F7F // // If the `reg` register isn't needed, it would preferrable to use: // mask := VecBroadcast(U8(0x7F), XMM()) func VecBroadcast(ir Op, xyz Register) Register { vec := xyz.(Vec) reg, size := load(ir) // PINSR{B,W} accept either m{8,16} or r32. If the input was // r{8,16} we need to cast to 32 bits. if reg.Size() < 4 { if gp, ok := reg.(GP); ok { reg = gp.As32() } else { r32 := GP32() switch reg.Size() { case 1: MOVBLZX(reg, r32) case 2: MOVWLZX(reg, r32) } reg = r32 } } switch size { default: panic("unsupported register size") case 1: PINSRB(Imm(0), reg, vec.AsX()) VPBROADCASTB(vec.AsX(), xyz) case 2: PINSRW(Imm(0), reg, vec.AsX()) VPBROADCASTW(vec.AsX(), xyz) case 4: PINSRD(Imm(0), reg, vec.AsX()) VPBROADCASTD(vec.AsX(), xyz) case 8: PINSRQ(Imm(0), reg, vec.AsX()) VPBROADCASTQ(vec.AsX(), xyz) } return xyz } // VectorLane is an interface for abstracting allocating and loading memory into // vector registers. This is used during the map phase of the Vectorizer. type VectorLane interface { Read(Mem) Register Offset(Mem) Mem Alloc() Register } // Vectorizer is a map/reduce-based helper for constructing parallelized // instruction pipelines. type Vectorizer struct { max int // total registers allowed mapper func(VectorLane) Register // function to map the main operation to an output register reducer func(a, b Register) Register // function to reduce the mapped output registers into one } // NewVectorizer creates a new vectorizing utility utilizing a max number of // registers and a mapper function. 
func NewVectorizer(max int, mapper func(VectorLane) Register) *Vectorizer { return &Vectorizer{max: max, mapper: mapper} } // Reduce sets the reducer function in the Vectorizer. func (v *Vectorizer) Reduce(h func(a, b Register) Register) *Vectorizer { v.reducer = h return v } // Compile runs the map and reduce phases for the given register size and // parallel lane count. This can be called multiple times using different // configurations to produce separate execution strides. The returned slice // is dependent on the presence of a reducer. If no reducer is used, the // slice will be all of the output registers from the map phase. If a reducer // is defined, the result is slice containing the final reduced register. func (v *Vectorizer) Compile(spec Spec, lanes int) []Register { alloc := NewVectorAlloc(VecList(spec, v.max), lanes) var out []Register for alloc.NextLane() { r := v.mapper(alloc) out = append(out, r) } if v.reducer != nil { for len(out) > 1 { r := v.reducer(out[0], out[1]) out = append(out[2:], r) } } return out } // ReduceOr performs a bitwise OR between two registers and returns the result. // This can be used as the reducer for a Vectorizer. func ReduceOr(a, b Register) Register { VPOR(b, a, a) return a } // ReduceAnd performs a bitwise AND between two registers and returns the result. // This can be used as the reducer for a Vectorizer. func ReduceAnd(a, b Register) Register { VPAND(b, a, a) return a } // VectorAlloc is a lower-level lane-driven register allocator. This pulls // registers from a fixed list of physical registers for a given number of // lanes. Registers are allocated in distinct blocks; one block for each lane. type VectorAlloc struct { vec []VecPhysical // all available physical registers rd map[Mem]vecRead // loaded register index off map[Mem]int // offset index lanes int // number of lanes being compiled lane int // current lane being compiled size int // register size } type vecRead struct { reg []Register idx int } // NewVectorAlloc creates a new VectorAlloc instance. func NewVectorAlloc(vec []VecPhysical, lanes int) *VectorAlloc { return &VectorAlloc{ vec: vec, rd: map[Mem]vecRead{}, off: map[Mem]int{}, lanes: lanes, lane: -1, size: int(vec[0].Size()), } } // NextLane is used to advance the allocator to the next lane available lane. // This returns false when no more lanes are available. func (a *VectorAlloc) NextLane() bool { next := a.lane + 1 if next < a.lanes { a.lane = next return true } return false } // Read implements the VectorLane interface. This reads the next register-sized // memory region. Each read within a single lane will load adjacent memory // regions. Subsequent lanes will read adjacent memory after the last read of // the prior lane. This has special handling so that all reads are batched // together. Because of this, Read calls should appear first in the mapper. func (a *VectorAlloc) Read(mem Mem) Register { if _, ok := a.off[mem]; ok { panic("Offset and Read cannot current be combined for the same memory region") } rd := a.rd[mem] if a.lane == 0 { for i := 0; i < a.lanes; i++ { r := a.Alloc() VMOVDQU(mem.Offset(len(rd.reg)*a.size), r) rd.reg = append(rd.reg, r) } } r := rd.reg[rd.idx] rd.idx++ a.rd[mem] = rd return r } // Offset implements the VectorLane interface. This calculates the next // register-sized memory offset. Each offset within a single lane will refer // to adjacent memory regions. Subsequent lanes will obtain an offset of // adjacent memory after the last offset of the prior lane. 
func (a *VectorAlloc) Offset(mem Mem) Mem { if _, ok := a.rd[mem]; ok { panic("Read and Offset cannot currently be combined for the same memory region") } n := a.off[mem] a.off[mem] = n + 1 return mem.Offset(n * a.size) } // Alloc implements the VectorLane interface. This allocates a register for the // current lane. func (a *VectorAlloc) Alloc() Register { if len(a.vec) == 0 { panic("not enough vector registers available") } r := a.vec[0] a.vec = a.vec[1:] return r } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/keyset/000077500000000000000000000000001452252572700242335ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/keyset/keyset_asm.go000066400000000000000000000114511452252572700267300ustar00rootroot00000000000000//go:build ignore // +build ignore package main import ( "fmt" "math" . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/asm" ) const ( unroll = 4 pageSize = 4096 maxLength = 16 ) func init() { ConstraintExpr("!purego") } func main() { Lookup() Generate() } // Lookup searches for a key in a set of keys. // // Each key in the set of keys should be padded to 16 bytes and concatenated // into a single buffer. The routine searches for the input key in the set of // keys and returns its index if found. If not found, the routine returns the // number of keys (len(keyset)/16). func Lookup() { TEXT("Lookup", NOSPLIT, "func(keyset []byte, key []byte) int") Doc("Lookup searches for a key in a set of keys, returning its index if ", "found. If the key cannot be found, the number of keys is returned.") // Load inputs. keyset := Load(Param("keyset").Base(), GP64()) count := Load(Param("keyset").Len(), GP64()) SHRQ(Imm(4), count) keyPtr := Load(Param("key").Base(), GP64()) keyLen := Load(Param("key").Len(), GP64()) keyCap := Load(Param("key").Cap(), GP64()) // None of the keys are larger than maxLength. CMPQ(keyLen, Imm(maxLength)) JA(LabelRef("not_found")) // We're going to be unconditionally loading 16 bytes from the input key // so first check if it's safe to do so (cap >= 16). If not, defer to // safe_load for additional checks. CMPQ(keyCap, Imm(maxLength)) JB(LabelRef("safe_load")) // Load the input key and pad with zeroes to 16 bytes. Label("load") key := XMM() VMOVUPS(Mem{Base: keyPtr}, key) Label("prepare") zeroes := XMM() VPXOR(zeroes, zeroes, zeroes) ones := XMM() VPCMPEQB(ones, ones, ones) var blendBytes [maxLength * 2]byte for j := 0; j < maxLength; j++ { blendBytes[j] = 0xFF } blendMasks := ConstBytes("blend_masks", blendBytes[:]) blendMasksPtr := GP64() LEAQ(blendMasks.Offset(maxLength), blendMasksPtr) SUBQ(keyLen, blendMasksPtr) blend := XMM() VMOVUPS(Mem{Base: blendMasksPtr}, blend) VPBLENDVB(blend, key, zeroes, key) // Zero out i so we can use it as the loop increment. i := GP64() XORQ(i, i) // Round the key count down to the nearest multiple of unroll to determine // how many iterations of the big loop we'll need. truncatedCount := GP64() MOVQ(count, truncatedCount) shift := uint64(math.Log2(float64(unroll))) SHRQ(Imm(shift), truncatedCount) SHLQ(Imm(shift), truncatedCount) // Loop over multiple keys in the big loop. 
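// The SHRQ/SHLQ pair emitted above is the usual shift idiom for rounding an
// integer down to a multiple of a power of two. A Go equivalent (illustrative
// only):
//
//	shift := uint(2)                       // log2(unroll) for unroll = 4
//	truncated := (count >> shift) << shift // same as count &^ (unroll - 1)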
Label("bigloop") CMPQ(i, truncatedCount) JE(LabelRef("loop")) x := []VecPhysical{X8, X9, X10, X11, X12, X13, X14, X15} for n := 0; n < unroll; n++ { VPCMPEQB(Mem{Base: keyset, Disp: maxLength * n}, key, x[n]) VPTEST(ones, x[n]) var target string if n == 0 { target = "done" } else { target = fmt.Sprintf("found%d", n) } JCS(LabelRef(target)) } // Advance and loop again. ADDQ(Imm(unroll), i) ADDQ(Imm(unroll*maxLength), keyset) JMP(LabelRef("bigloop")) // Loop over the remaining keys. Label("loop") CMPQ(i, count) JE(LabelRef("done")) // Try to match against the input key. match := XMM() VPCMPEQB(Mem{Base: keyset}, key, match) VPTEST(ones, match) JCS(LabelRef("done")) // Advance and loop again. Label("next") INCQ(i) ADDQ(Imm(maxLength), keyset) JMP(LabelRef("loop")) JMP(LabelRef("done")) // Return the loop increment, or the count if the key wasn't found. If we're // here from a jump within the big loop, the loop increment needs // correcting first. for j := unroll - 1; j > 0; j-- { Label(fmt.Sprintf("found%d", j)) INCQ(i) } Label("done") Store(i, ReturnIndex(0)) RET() Label("not_found") Store(count, ReturnIndex(0)) RET() // If the input key is near a page boundary, we must change the way we load // it to avoid a fault. We instead want to load the 16 bytes up to and // including the key, then shuffle the key forward in the register. E.g. for // key "foo" we would load the 13 bytes prior to the key along with "foo" // and then move the last 3 bytes forward so the first 3 bytes are equal // to "foo". Label("safe_load") pageOffset := GP64() MOVQ(keyPtr, pageOffset) ANDQ(U32(pageSize-1), pageOffset) CMPQ(pageOffset, U32(pageSize-maxLength)) JBE(LabelRef("load")) // Not near a page boundary. offset := GP64() MOVQ(^U64(0)-maxLength+1, offset) ADDQ(keyLen, offset) VMOVUPS(Mem{Base: keyPtr, Index: offset, Scale: 1}, key) var shuffleBytes [maxLength * 2]byte for j := 0; j < maxLength; j++ { shuffleBytes[j] = byte(j) shuffleBytes[j+maxLength] = byte(j) } shuffleMasks := ConstBytes("shuffle_masks", shuffleBytes[:]) shuffleMasksPtr := GP64() LEAQ(shuffleMasks.Offset(maxLength), shuffleMasksPtr) SUBQ(keyLen, shuffleMasksPtr) shuffle := XMM() VMOVUPS(Mem{Base: shuffleMasksPtr}, shuffle) VPSHUFB(shuffle, key, key) JMP(LabelRef("prepare")) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/mem/000077500000000000000000000000001452252572700235055ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/mem/blend_asm.go000066400000000000000000000005321452252572700257600ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" "github.com/segmentio/asm/build/internal/x86" ) func init() { ConstraintExpr("!purego") } func main() { x86.GenerateCopy("Blend", "copies the one-bits of src to dst, returning the number of bytes written.", x86.BinaryOpTable(ORB, ORW, ORL, ORQ, POR, VPOR)) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/mem/contains_asm.go000066400000000000000000000073741452252572700265250ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/x86" ) func init() { ConstraintExpr("!purego") } func main() { TEXT("ContainsByte", NOSPLIT, "func(haystack []byte, needle byte) bool") haystack := Load(Param("haystack").Base(), GP64()) length := Load(Param("haystack").Len(), GP64()) // Broadcast the needle byte to each 8 bytes in a GP64. 
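// A scalar Go equivalent of the broadcast built below (illustrative only):
// each shift-and-or doubles the number of populated bytes, so three steps
// fill all eight bytes of the 64 bit register.
//
//	x := uint64(needle)
//	x |= x << 8  // 2 copies of the byte
//	x |= x << 16 // 4 copies
//	x |= x << 32 // 8 copies; same result as uint64(needle) * 0x0101010101010101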
needle := GP64() XORQ(needle, needle) Load(Param("needle"), needle.As8()) tmp := GP64() for i := 3; i <= 5; i++ { MOVQ(needle, tmp) SHLQ(U8(1< 1 { ops = append(ops[2:], op(ops[0], ops[1])) } return ops[0] } func binary(ins func(Op, Op)) func(Op, Op) Op { return func(src Op, dst Op) Op { ins(src, dst) return dst } } func vex(ins func(Op, Op, Op)) func(Op, Op) Op { return func(src Op, dst Op) Op { ins(src, dst, dst) return dst } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/mem/copy_asm.go000066400000000000000000000004311452252572700256440ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" "github.com/segmentio/asm/build/internal/x86" ) func init() { ConstraintExpr("!purego") } func main() { x86.GenerateCopy("Copy", "copies src to dst, returning the number of bytes written.", nil) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/mem/count_pair_asm.go000066400000000000000000000172531452252572700270470ustar00rootroot00000000000000// +build ignore package main import ( "fmt" "math/bits" . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/x86" "github.com/segmentio/asm/cpu" ) func init() { ConstraintExpr("!purego") } func main() { generateCountPair(countPair1{}) generateCountPair(countPair2{}) generateCountPair(countPair4{}) generateCountPair(countPair8{}) generateCountPair(countPair16{}) generateCountPair(countPair32{}) Generate() } type countPair interface { size() int test(a, b Mem) } type countPairAVX2 interface { countPair vpcmpeq(src0, src1, dst VecVirtual) vpmovmskb(tmp, src VecVirtual, dst Register) } type countPair1 struct{} func (countPair1) size() int { return 1 } func (countPair1) test(a, b Mem) { generateCountPairTest(MOVB, CMPB, GP8, a, b) } func (countPair1) vpcmpeq(a, b, c VecVirtual) { VPCMPEQB(a, b, c) } func (countPair1) vpmovmskb(_, a VecVirtual, b Register) { VPMOVMSKB(a, b) } type countPair2 struct{} func (countPair2) size() int { return 2 } func (countPair2) test(a, b Mem) { generateCountPairTest(MOVW, CMPW, GP16, a, b) } func (countPair2) vpcmpeq(a, b, c VecVirtual) { VPCMPEQW(a, b, c) } func (countPair2) vpmovmskb(_, a VecVirtual, b Register) { VPMOVMSKB(a, b) } type countPair4 struct{} func (countPair4) size() int { return 4 } func (countPair4) test(a, b Mem) { generateCountPairTest(MOVL, CMPL, GP32, a, b) } func (countPair4) vpcmpeq(a, b, c VecVirtual) { VPCMPEQD(a, b, c) } func (countPair4) vpmovmskb(_, a VecVirtual, b Register) { VPMOVMSKB(a, b) } type countPair8 struct{} func (countPair8) size() int { return 8 } func (countPair8) test(a, b Mem) { generateCountPairTest(MOVQ, CMPQ, GP64, a, b) } func (countPair8) vpcmpeq(a, b, c VecVirtual) { VPCMPEQQ(a, b, c) } func (countPair8) vpmovmskb(_, a VecVirtual, b Register) { VPMOVMSKB(a, b) } type countPair16 struct{} func (countPair16) size() int { return 16 } func (countPair16) test(a, b Mem) { r0, r1 := XMM(), XMM() MOVOU(a, r0) MOVOU(b, r1) mask := GP32() PCMPEQQ(r0, r1) PMOVMSKB(r1, mask) CMPL(mask, U32(0xFFFF)) } func (countPair16) vpcmpeq(a, b, c VecVirtual) { VPCMPEQQ(a, b, c) } func (countPair16) vpmovmskb(tmp, src VecVirtual, dst Register) { // https://www.felixcloutier.com/x86/vpermq#vpermq--vex-256-encoded-version- // // Swap each quad word in the lower and upper half of the 32 bytes register, // then AND the src and tmp registers to zero each halves that were partial // equality; only fully equal 128 bits need to result in 
setting 1 bits in // the destination mask. const permutation = (1 << 0) | (0 << 2) | (3 << 4) | (2 << 6) VPERMQ(Imm(permutation), src, tmp) VPAND(src, tmp, tmp) VPMOVMSKB(tmp, dst) } type countPair32 struct{} func (countPair32) size() int { return 32 } func (countPair32) test(a, b Mem) { r0, r1, r2, r3 := XMM(), XMM(), XMM(), XMM() MOVOU(a, r0) MOVOU(a.Offset(16), r1) MOVOU(b, r2) MOVOU(b.Offset(16), r3) mask0, mask1 := GP32(), GP32() PCMPEQQ(r0, r2) PCMPEQQ(r1, r3) PMOVMSKB(r2, mask0) PMOVMSKB(r3, mask1) ANDL(mask1, mask0) CMPL(mask0, U32(0xFFFF)) } func (countPair32) vpcmpeq(a, b, c VecVirtual) { VPCMPEQQ(a, b, c) } func (countPair32) vpmovmskb(_, src VecVirtual, dst Register) { VPMOVMSKB(src, dst) } func generateCountPair(code countPair) { size := code.size() TEXT(fmt.Sprintf("countPair%d", size), NOSPLIT, "func(b []byte) int") p := Load(Param("b").Base(), GP64()) n := Load(Param("b").Len(), GP64()) r := GP64() XORQ(r, r) SUBQ(Imm(uint64(size)), n) if _, ok := code.(countPairAVX2); ok { JumpIfFeature("avx2", cpu.AVX2) } Label("tail") CMPQ(n, Imm(0)) JLE(LabelRef("done")) Label("generic") x := GP64() MOVQ(r, x) INCQ(x) code.test(Mem{Base: p}, (Mem{Base: p}).Offset(size)) CMOVQEQ(x, r) ADDQ(Imm(uint64(size)), p) SUBQ(Imm(uint64(size)), n) CMPQ(n, Imm(0)) JG(LabelRef("generic")) Label("done") Store(r, ReturnIndex(0)) RET() if avx, ok := code.(countPairAVX2); ok { const avxChunk = 256 const avxLanes = avxChunk / 32 Label("avx2") CMPQ(n, U32(avxChunk+uint64(size))) JL(LabelRef(fmt.Sprintf("avx2_tail%d", avxChunk/2))) masks := make([]GPVirtual, avxLanes) for i := range masks { masks[i] = GP64() XORQ(masks[i], masks[i]) } regA := make([]VecVirtual, avxLanes) regB := make([]VecVirtual, avxLanes) for i := range regA { regA[i] = YMM() regB[i] = YMM() } Label(fmt.Sprintf("avx2_loop%d", avxChunk)) generateCountPairAVX2(r, p, regA, regB, masks, avx) ADDQ(U32(avxChunk), p) SUBQ(U32(avxChunk), n) CMPQ(n, U32(avxChunk+uint64(size))) JGE(LabelRef(fmt.Sprintf("avx2_loop%d", avxChunk))) for chunk := avxChunk / 2; chunk >= 32; chunk /= 2 { Label(fmt.Sprintf("avx2_tail%d", chunk)) CMPQ(n, Imm(uint64(chunk+size))) JL(LabelRef(fmt.Sprintf("avx2_tail%d", chunk/2))) lanes := chunk / 32 generateCountPairAVX2(r, p, regA[:lanes], regB[:lanes], masks[:lanes], avx) ADDQ(U32(uint64(chunk)), p) SUBQ(U32(uint64(chunk)), n) } Label("avx2_tail16") if size < 16 { CMPQ(n, Imm(uint64(16+size))) JL(LabelRef("avx2_tail")) generateCountPairAVX2(r, p, []VecVirtual{XMM()}, []VecVirtual{XMM()}, masks[:1], avx) ADDQ(Imm(16), p) SUBQ(Imm(16), n) } Label("avx2_tail") VZEROUPPER() if size < 32 { if shift := divideShift(size); shift > 0 { SHRQ(Imm(uint64(shift)), r) } } JMP(LabelRef("tail")) } } func generateCountPairTest(mov func(Op, Op), cmp func(Op, Op), reg func() GPVirtual, a, b Mem) { r := reg() mov(a, r) cmp(r, b) } func generateCountPairAVX2(r, p Register, regA, regB []VecVirtual, masks []GPVirtual, code countPairAVX2) { size := code.size() moves := make(map[int]VecVirtual) for i, reg := range regA { VMOVDQU((Mem{Base: p}).Offset(i*32), reg) moves[i*32] = reg } for i, reg := range regB { // Skip loading from memory a second time if we already loaded the // offset in the previous loop. This optimization applies for items // of size 32. if moves[i*32+size] == nil { lo := moves[i*32+(size-16)] hi := moves[i*32+(size+16)] if lo != nil && hi != nil { // https://www.felixcloutier.com/x86/vperm2i128#vperm2i128 // // The data was already loaded, but split across two registers. 
// We recompose it using a permutation of the upper and lower // halves of the registers holding the contiguous data. // // Note that in Go assembly the arguments are reversed; // SRC1 is `lo` and SRC2 is `hi`, but we pass them in the // reverse order. const permutation = (1 << 0) | (2 << 4) VPERM2I128(Imm(permutation), hi, lo, reg) } else { VMOVDQU((Mem{Base: p}).Offset(i*32+size), reg) } } } for i := range regA { // The load may have been elided if there was offset overlaps between // the two sources. if mov := moves[i*32+size]; mov != nil { code.vpcmpeq(regA[i], mov, regB[i]) } else { code.vpcmpeq(regA[i], regB[i], regB[i]) } code.vpmovmskb(regA[i], regB[i], masks[i].As32()) } for _, mask := range masks { POPCNTQ(mask, mask) if size == 32 { SHRQ(Imm(uint64(divideShift(size))), mask) } } ADDQ(divideAndConquerSum(masks), r) } func divideShift(size int) int { return bits.TrailingZeros(uint(size)) } func divideAndConquerSum(regs []GPVirtual) GPVirtual { switch len(regs) { case 1: return regs[0] case 2: r0, r1 := regs[0], regs[1] ADDQ(r1, r0) return r0 default: i := len(regs) / 2 r0 := divideAndConquerSum(regs[:i]) r1 := divideAndConquerSum(regs[i:]) ADDQ(r1, r0) return r0 } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/mem/index_pair_asm.go000066400000000000000000000205351452252572700270230ustar00rootroot00000000000000// +build ignore package main import ( "fmt" "math" . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/x86" "github.com/segmentio/asm/cpu" ) func init() { ConstraintExpr("!purego") } func main() { generateIndexPair(indexPair1{}) generateIndexPair(indexPair2{}) generateIndexPair(indexPair4{}) generateIndexPair(indexPair8{}) generateIndexPair(indexPair16{}) generateIndexPair(indexPair32{}) Generate() } type indexPair interface { size() int test(a, b Mem) } type indexPairAVX2 interface { indexPair vpcmpeq(src0, src1, dst VecVirtual) vpmovmskb(tmp, src VecVirtual, spare, dst Register) } type indexPair1 struct{} func (indexPair1) size() int { return 1 } func (indexPair1) test(a, b Mem) { generateIndexPairTest(MOVB, CMPB, GP8, a, b) } func (indexPair1) vpcmpeq(a, b, c VecVirtual) { VPCMPEQB(a, b, c) } func (indexPair1) vpmovmskb(_, a VecVirtual, _, b Register) { VPMOVMSKB(a, b) } type indexPair2 struct{} func (indexPair2) size() int { return 2 } func (indexPair2) test(a, b Mem) { generateIndexPairTest(MOVW, CMPW, GP16, a, b) } func (indexPair2) vpcmpeq(a, b, c VecVirtual) { VPCMPEQW(a, b, c) } func (indexPair2) vpmovmskb(_, a VecVirtual, _, b Register) { VPMOVMSKB(a, b) } type indexPair4 struct{} func (indexPair4) size() int { return 4 } func (indexPair4) test(a, b Mem) { generateIndexPairTest(MOVL, CMPL, GP32, a, b) } func (indexPair4) vpcmpeq(a, b, c VecVirtual) { VPCMPEQD(a, b, c) } func (indexPair4) vpmovmskb(_, a VecVirtual, _, b Register) { VPMOVMSKB(a, b) } type indexPair8 struct{} func (indexPair8) size() int { return 8 } func (indexPair8) test(a, b Mem) { generateIndexPairTest(MOVQ, CMPQ, GP64, a, b) } func (indexPair8) vpcmpeq(a, b, c VecVirtual) { VPCMPEQQ(a, b, c) } func (indexPair8) vpmovmskb(_, a VecVirtual, _, b Register) { VPMOVMSKB(a, b) } type indexPair16 struct{} func (indexPair16) size() int { return 16 } func (indexPair16) test(a, b Mem) { r0, r1 := XMM(), XMM() MOVOU(a, r0) MOVOU(b, r1) mask := GP32() PCMPEQQ(r0, r1) PMOVMSKB(r1, mask) CMPL(mask, U32(0xFFFF)) } func (indexPair16) vpcmpeq(a, b, c VecVirtual) { VPCMPEQQ(a, b, c) } func 
(indexPair16) vpmovmskb(tmp, src VecVirtual, _, dst Register) { // https://www.felixcloutier.com/x86/vpermq#vpermq--vex-256-encoded-version- // // Swap each quad word in the lower and upper half of the 32 bytes register, // then AND the src and tmp registers to zero each halves that were partial // equality; only fully equal 128 bits need to result in setting 1 bits in // the destination mask. const permutation = (1 << 0) | (0 << 2) | (3 << 4) | (2 << 6) VPERMQ(Imm(permutation), src, tmp) VPAND(src, tmp, tmp) VPMOVMSKB(tmp, dst) } type indexPair32 struct{} func (indexPair32) size() int { return 32 } func (indexPair32) test(a, b Mem) { r0, r1, r2, r3 := XMM(), XMM(), XMM(), XMM() MOVOU(a, r0) MOVOU(a.Offset(16), r1) MOVOU(b, r2) MOVOU(b.Offset(16), r3) mask0, mask1 := GP32(), GP32() PCMPEQQ(r0, r2) PCMPEQQ(r1, r3) PMOVMSKB(r2, mask0) PMOVMSKB(r3, mask1) ANDL(mask1, mask0) CMPL(mask0, U32(0xFFFF)) } func (indexPair32) vpcmpeq(a, b, c VecVirtual) { VPCMPEQQ(a, b, c) } func (indexPair32) vpmovmskb(_, src VecVirtual, zero, dst Register) { VPMOVMSKB(src, dst) CMPL(dst, U32(0xFFFFFFFF)) CMOVLNE(zero, dst) } func generateIndexPair(code indexPair) { size := code.size() TEXT(fmt.Sprintf("indexPair%d", size), NOSPLIT, "func(b []byte) int") p := Load(Param("b").Base(), GP64()) n := Load(Param("b").Len(), GP64()) base := GP64() MOVQ(p, base) CMPQ(n, Imm(0)) JLE(LabelRef("fail")) SUBQ(Imm(uint64(size)), n) if _, ok := code.(indexPairAVX2); ok { JumpIfFeature("avx2", cpu.AVX2) } Label("tail") CMPQ(n, Imm(0)) JE(LabelRef("fail")) Label("generic") code.test(Mem{Base: p}, (Mem{Base: p}).Offset(size)) JE(LabelRef("done")) ADDQ(Imm(uint64(size)), p) SUBQ(Imm(uint64(size)), n) CMPQ(n, Imm(0)) JA(LabelRef("generic")) index := p Label("fail") MOVQ(U64(math.MaxUint64), index) Store(index, ReturnIndex(0)) RET() Label("done") // The delta between the base pointer and how far we advanced is the index of the pair. 
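// For reference, a portable Go version of what the indexPairN routines
// compute (illustrative only; bytes.Equal stands in for the SIMD compares,
// and returning -1 matches the math.MaxUint64 sentinel stored above):
//
//	func indexPairRef(b []byte, size int) int {
//		for i := 0; i+2*size <= len(b); i += size {
//			if bytes.Equal(b[i:i+size], b[i+size:i+2*size]) {
//				return i
//			}
//		}
//		return -1
//	}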
SUBQ(base, index) Store(index, ReturnIndex(0)) RET() if avx, ok := code.(indexPairAVX2); ok { const avxChunk = 256 const avxLanes = avxChunk / 32 Label("avx2") CMPQ(n, U32(avxChunk+uint64(size))) JB(LabelRef(fmt.Sprintf("avx2_tail%d", avxChunk/2))) masks := make([]GPVirtual, avxLanes) for i := range masks { masks[i] = GP64() XORQ(masks[i], masks[i]) } regA := make([]VecVirtual, avxLanes) regB := make([]VecVirtual, avxLanes) for i := range regA { regA[i] = YMM() regB[i] = YMM() } Label(fmt.Sprintf("avx2_loop%d", avxChunk)) generateIndexPairAVX2(p, regA, regB, masks, avx) ADDQ(U32(avxChunk), p) SUBQ(U32(avxChunk), n) CMPQ(n, U32(avxChunk+uint64(size))) JAE(LabelRef(fmt.Sprintf("avx2_loop%d", avxChunk))) for chunk := avxChunk / 2; chunk >= 32; chunk /= 2 { Label(fmt.Sprintf("avx2_tail%d", chunk)) CMPQ(n, Imm(uint64(chunk+size))) JB(LabelRef(fmt.Sprintf("avx2_tail%d", chunk/2))) lanes := chunk / 32 generateIndexPairAVX2(p, regA[:lanes], regB[:lanes], masks[:lanes], avx) ADDQ(U32(uint64(chunk)), p) SUBQ(U32(uint64(chunk)), n) } Label("avx2_tail16") if size < 16 { CMPQ(n, Imm(uint64(16+size))) JB(LabelRef("avx2_tail")) generateIndexPairAVX2(p, []VecVirtual{XMM()}, []VecVirtual{XMM()}, masks[:1], avx) ADDQ(Imm(16), p) SUBQ(Imm(16), n) } Label("avx2_tail") VZEROUPPER() JMP(LabelRef("tail")) Label("avx2_done") VZEROUPPER() for i, mask := range masks { CMPQ(mask, Imm(0)) JNE(LabelRef(fmt.Sprintf("avx2_done%d", i))) } for i, mask := range masks { Label(fmt.Sprintf("avx2_done%d", i)) if i > 0 { ADDQ(U32(uint64(i*32)), p) SUBQ(U32(uint64(i*32)), n) } TZCNTQ(mask, mask) ADDQ(mask, p) SUBQ(mask, n) JMP(LabelRef("done")) } } } func generateIndexPairTest(mov func(Op, Op), cmp func(Op, Op), reg func() GPVirtual, a, b Mem) { r := reg() mov(a, r) cmp(r, b) } func generateIndexPairAVX2(p Register, regA, regB []VecVirtual, masks []GPVirtual, code indexPairAVX2) { size := code.size() moves := make(map[int]VecVirtual) spare := GP64() if size == 32 { // This is a bit of an implicit coupling to the 32 bytes specialication, // but it did not seem worth the extra complexity to have more // abstractions. // // The spare register is passed to vpmovmskb and must be initialized to // zero as it may be used to clear the mask register. XORQ(spare, spare) } for i, reg := range regA { VMOVDQU((Mem{Base: p}).Offset(i*32), reg) moves[i*32] = reg } for i, reg := range regB { // Skip loading from memory a second time if we already loaded the // offset in the previous loop. This optimization applies for items // of size 32. if moves[i*32+size] == nil { lo := moves[i*32+(size-16)] hi := moves[i*32+(size+16)] if lo != nil && hi != nil { // https://www.felixcloutier.com/x86/vperm2i128#vperm2i128 // // The data was already loaded, but split across two registers. // We recompose it using a permutation of the upper and lower // halves of the registers holding the contiguous data. // // Note that in Go assembly the arguments are reversed; // SRC1 is `lo` and SRC2 is `hi`, but we pass them in the // reverse order. const permutation = (1 << 0) | (2 << 4) VPERM2I128(Imm(permutation), hi, lo, reg) } else { VMOVDQU((Mem{Base: p}).Offset(i*32+size), reg) } } } for i := range regA { // The load may have been elided if there was offset overlaps between // the two sources. 
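// The moves map above acts as a small common-subexpression pass over the
// vector loads. Conceptually (illustrative sketch; the helper names are
// hypothetical):
//
//	loaded := map[int]Reg{} // byte offset -> register already holding that chunk
//	load := func(off int) Reg {
//		if r, ok := loaded[off]; ok {
//			return r // reuse the register and elide the second VMOVDQU
//		}
//		r := allocVec()
//		emitLoad(off, r)
//		loaded[off] = r
//		return r
//	}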
if mov := moves[i*32+size]; mov != nil { code.vpcmpeq(regA[i], mov, regB[i]) } else { code.vpcmpeq(regA[i], regB[i], regB[i]) } } for i := range regB { code.vpmovmskb(regA[i], regB[i], spare.As32(), masks[i].As32()) } combinedMask := spare if len(masks) == 1 { combinedMask = masks[0] } else { XORQ(combinedMask, combinedMask) for _, mask := range masks { ORQ(mask, combinedMask) } } CMPQ(combinedMask, Imm(0)) JNE(LabelRef("avx2_done")) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/mem/mask_asm.go000066400000000000000000000005731452252572700256340ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" "github.com/segmentio/asm/build/internal/x86" ) func init() { ConstraintExpr("!purego") } func main() { x86.GenerateCopy("Mask", "set bits of dst to zero and copies the one-bits of src to dst, returning the number of bytes written.", x86.BinaryOpTable(ANDB, ANDW, ANDL, ANDQ, PAND, VPAND)) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/qsort/000077500000000000000000000000001452252572700240775ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/qsort/sort_asm.go000066400000000000000000000207771452252572700262720ustar00rootroot00000000000000// +build !amd64 package main import ( "fmt" "math" . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" ) func init() { ConstraintExpr("!purego") } func main() { distributeForward(&SortableScalar{reg: GP64, size: 8, mov: MOVQ, cmp: CMPQ}) distributeBackward(&SortableScalar{reg: GP64, size: 8, mov: MOVQ, cmp: CMPQ}) insertionsort(&SortableVector{reg: XMM, size: 16}) distributeForward(&SortableVector{reg: XMM, size: 16}) distributeBackward(&SortableVector{reg: XMM, size: 16}) insertionsort(&SortableVector{reg: YMM, size: 32}) distributeForward(&SortableVector{reg: YMM, size: 32}) distributeBackward(&SortableVector{reg: YMM, size: 32}) Generate() } type Sortable interface { Register() Register Size() uint64 Init() Move(Op, Op) Compare(Register, Register) } type SortableScalar struct { reg func() GPVirtual size uint64 mov func(Op, Op) cmp func(Op, Op) } func (s *SortableScalar) Register() Register { return s.reg() } func (s *SortableScalar) Size() uint64 { return s.size } func (s *SortableScalar) Init() {} func (s *SortableScalar) Move(a, b Op) { s.mov(a, b) } func (s *SortableScalar) Compare(a, b Register) { s.cmp(a, b) } type SortableVector struct { reg func() VecVirtual size uint64 ones Register msb Register } func (s *SortableVector) Register() Register { return s.reg() } func (s *SortableVector) Size() uint64 { return s.size } func (s *SortableVector) Init() { s.ones = s.reg() VPCMPEQB(s.ones, s.ones, s.ones) s.msb = s.reg() VPSLLQ(Imm(63), s.ones, s.msb) } func (s *SortableVector) Move(a, b Op) { VMOVDQU(a, b) } func (s *SortableVector) Compare(a, b Register) { // The following is a routine for vectors that yields the same ZF/CF // result as a CMP instruction. // First compare each packed qword for equality. eq := s.Register() VPCMPEQQ(a, b, eq) // SSE4.2 and AVX2 have a CMPGTQ to compare packed qwords, but // unfortunately it's a signed comparison. We know that u64 has // range [0,2^64-1] and signed (two's complement) i64 has range // [-2^63,2^63-1]. We can add (or subtract) 2^63 to each packed // unsigned qword and reinterpret each as a signed qword. 
Doing so // allows us to utilize a signed comparison, and yields the same // result as if we were doing an unsigned comparison with the input. // As usual, AVX-512 fixes the problem with its VPCMPUQ. lt := s.Register() aSigned := s.Register() bSigned := s.Register() VPADDQ(a, s.msb, aSigned) VPADDQ(b, s.msb, bSigned) VPCMPGTQ(aSigned, bSigned, lt) // Extract bit masks. eqMask := GP32() ltMask := GP32() VMOVMSKPD(eq, eqMask) VMOVMSKPD(lt, ltMask) // Invert the equality mask to find qwords that weren't equal. // Bit-scan forward to find the first unequal byte, then test // that bit in the less-than mask. NOTL(eqMask) unequalByteIndex := GP32() BSFL(eqMask, unequalByteIndex) // set ZF BTSL(unequalByteIndex, ltMask) // set CF } func insertionsort(s Sortable) { size := s.Size() TEXT(fmt.Sprintf("insertionsort%dNoSwap", size*8), NOSPLIT, fmt.Sprintf("func(data []%s, base int, swap func(int, int))", typeFor(size))) Pragma("noescape") data := Load(Param("data").Base(), GP64()) end := Load(Param("data").Len(), GP64()) shift := log2(size) SHLQ(Imm(shift), end) ADDQ(data, end) TESTQ(data, end) JE(LabelRef("done")) s.Init() i := GP64() MOVQ(data, i) Label("outer") ADDQ(Imm(size), i) CMPQ(i, end) JAE(LabelRef("done")) item := s.Register() s.Move(Mem{Base: i}, item) j := GP64() MOVQ(i, j) Label("inner") prev := s.Register() s.Move(Mem{Base: j, Disp: -int(size)}, prev) s.Compare(item, prev) JAE(LabelRef("outer")) s.Move(prev, Mem{Base: j}) s.Move(item, Mem{Base: j, Disp: -int(size)}) SUBQ(Imm(size), j) CMPQ(j, data) JA(LabelRef("inner")) JMP(LabelRef("outer")) Label("done") if size > 16 { VZEROUPPER() } RET() } func distributeForward(s Sortable) { size := s.Size() TEXT(fmt.Sprintf("distributeForward%d", size*8), NOSPLIT, fmt.Sprintf("func(data, scratch []%s, limit, lo, hi int) int", typeFor(size))) Pragma("noescape") // Load inputs. data := Load(Param("data").Base(), GP64()) scratch := Load(Param("scratch").Base(), GP64()) limit := Load(Param("limit"), GP64()) loIndex := Load(Param("lo"), GP64()) hiIndex := Load(Param("hi"), GP64()) // Convert indices to byte offsets if necessary. We can use indices // only if the size is a valid scale (1/2/4/8). shift := log2(size) var scale uint8 if size <= 8 { scale = uint8(size) } else { scale = 1 SHLQ(Imm(shift), limit) SHLQ(Imm(shift), loIndex) SHLQ(Imm(shift), hiIndex) } // Prepare read/cmp pointers. lo := GP64() hi := GP64() tail := GP64() LEAQ(Mem{Base: data, Index: loIndex, Scale: scale}, lo) LEAQ(Mem{Base: data, Index: hiIndex, Scale: scale}, hi) LEAQ(Mem{Base: scratch, Index: limit, Scale: scale, Disp: -int(size)}, tail) s.Init() // Load the pivot item. pivot := s.Register() s.Move(Mem{Base: data}, pivot) offset := GP64() zero := GP64() XORQ(offset, offset) XORQ(zero, zero) isGreaterOrEqual := zero // We'll be keeping a negative offset. Negate the limit so we can // compare the two in the loop. NEGQ(limit) Label("loop") // Load the next item. next := s.Register() s.Move(Mem{Base: lo}, next) // Compare the item with the pivot. s.Compare(next, pivot) // Conditionally write to either the beginning of the data slice, or // end of the scratch slice. dst := GP64() MOVQ(lo, dst) CMOVQCC(tail, dst) s.Move(next, Mem{Base: dst, Index: offset, Scale: scale}) if size <= 8 { // If we're only subtracting one from the index, we can invert CF and use // subtract with carry. CMC() SBBQ(zero, offset) } else { // Otherwise we need to extract an inverted CF and shift to get a byte // amount to advance by. 
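// Both the SBB path above and the SETCC path below compute the same
// branchless update; in Go terms (illustrative only): the write offset moves
// only when the item was redirected to the scratch slice, i.e. when
// item >= pivot.
//
//	ge := 0
//	if item >= pivot { // the inverted carry flag from the comparison
//		ge = 1
//	}
//	offset -= ge << shift // in bytes here; the SBB path subtracts ge in item units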
SETCC(isGreaterOrEqual.As8()) SHLQ(Imm(shift), isGreaterOrEqual) SUBQ(isGreaterOrEqual, offset) } ADDQ(Imm(size), lo) // Loop while we have more input, and enough room in the scratch slice. CMPQ(lo, hi) JA(LabelRef("done")) CMPQ(offset, limit) JNE(LabelRef("loop")) // Return the number of items written to the data slice. Label("done") SUBQ(data, lo) LEAQ(Mem{Base: lo, Index: offset, Scale: scale}, lo) SHRQ(Imm(shift), lo) DECQ(lo) Store(lo, ReturnIndex(0)) if size > 16 { VZEROUPPER() } RET() } func distributeBackward(s Sortable) { size := s.Size() TEXT(fmt.Sprintf("distributeBackward%d", size*8), NOSPLIT, fmt.Sprintf("func(data, scratch []%s, limit, lo, hi int) int", typeFor(size))) Pragma("noescape") // Load inputs. data := Load(Param("data").Base(), GP64()) scratch := Load(Param("scratch").Base(), GP64()) limit := Load(Param("limit"), GP64()) loIndex := Load(Param("lo"), GP64()) hiIndex := Load(Param("hi"), GP64()) // Convert indices to byte offsets if necessary. We can use indices // only if the size is a valid scale (1/2/4/8). shift := log2(size) var scale uint8 if size <= 8 { scale = uint8(size) } else { scale = 1 SHLQ(Imm(shift), limit) SHLQ(Imm(shift), loIndex) SHLQ(Imm(shift), hiIndex) } // Prepare read/cmp pointers. lo := GP64() hi := GP64() LEAQ(Mem{Base: data, Index: loIndex, Scale: scale}, lo) LEAQ(Mem{Base: data, Index: hiIndex, Scale: scale}, hi) s.Init() // Load the pivot item. pivot := s.Register() s.Move(Mem{Base: data}, pivot) offset := GP64() zero := GP64() XORQ(offset, offset) XORQ(zero, zero) isLess := zero CMPQ(hi, lo) JBE(LabelRef("done")) Label("loop") // Load the next item. next := s.Register() s.Move(Mem{Base: hi}, next) // Compare the item with the pivot. s.Compare(next, pivot) // Conditionally write to either the end of the data slice, or // beginning of the scratch slice. dst := GP64() MOVQ(scratch, dst) CMOVQCC(hi, dst) s.Move(next, Mem{Base: dst, Index: offset, Scale: scale}) if size <= 8 { ADCQ(zero, offset) } else { SETCS(isLess.As8()) SHLQ(Imm(shift), isLess) ADDQ(isLess, offset) } SUBQ(Imm(size), hi) // Loop while we have more input, and enough room in the scratch slice. CMPQ(hi, lo) JBE(LabelRef("done")) CMPQ(offset, limit) JNE(LabelRef("loop")) // Return the number of items written to the data slice. Label("done") SUBQ(data, hi) LEAQ(Mem{Base: hi, Index: offset, Scale: scale}, hi) SHRQ(Imm(shift), hi) Store(hi, ReturnIndex(0)) if size > 16 { VZEROUPPER() } RET() } func log2(size uint64) uint64 { return uint64(math.Log2(float64(size))) } func typeFor(size uint64) string { switch size { case 32: return "struct { a, b, c, d uint64 }" case 16: return "struct { hi, lo uint64 }" default: return fmt.Sprintf("uint%d", size*8) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/slices/000077500000000000000000000000001452252572700242115ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/slices/sums_asm.go000066400000000000000000000066021452252572700263730ustar00rootroot00000000000000// +build ignore package main import ( "fmt" . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . 
"github.com/segmentio/asm/build/internal/x86" "github.com/mmcloughlin/avo/reg" "github.com/segmentio/asm/cpu" ) const unroll = 8 type Processor struct { name string typ string scale uint8 avxOffset uint64 avxAdd func(...Op) x86Mov func(imr, mr Op) x86Add func(imr, amr Op) x86Reg reg.GPVirtual } func init() { ConstraintExpr("!purego") } func main() { generate(Processor{ name: "sumUint64", typ: "uint64", scale: 8, avxOffset: 2, avxAdd: VPADDQ, x86Mov: MOVQ, x86Add: ADDQ, x86Reg: GP64(), }) generate(Processor{ name: "sumUint32", typ: "uint32", scale: 4, avxOffset: 4, avxAdd: VPADDD, x86Mov: MOVL, x86Add: ADDL, x86Reg: GP32(), }) generate(Processor{ name: "sumUint16", typ: "uint16", scale: 2, avxOffset: 8, avxAdd: VPADDW, x86Mov: MOVW, x86Add: ADDW, x86Reg: GP16(), }) generate(Processor{ name: "sumUint8", typ: "uint8", scale: 1, avxOffset: 16, avxAdd: VPADDB, x86Mov: MOVB, x86Add: ADDB, x86Reg: GP8(), }) Generate() } func generate(p Processor) { TEXT(p.name, NOSPLIT, fmt.Sprintf("func(x, y []%s)", p.typ)) Doc(fmt.Sprintf("Sum %ss using avx2 instructions, results stored in x", p.typ)) idx := GP64() XORQ(idx, idx) xPtr := Mem{Base: Load(Param("x").Base(), GP64()), Index: idx, Scale: p.scale} yPtr := Mem{Base: Load(Param("y").Base(), GP64()), Index: idx, Scale: p.scale} len := Load(Param("x").Len(), GP64()) yLen := Load(Param("y").Len(), GP64()) // len = min(len(x), len(y)) CMPQ(yLen, len) CMOVQLT(yLen, len) JumpUnlessFeature("x86_loop", cpu.AVX2) Label("avx2_loop") next := GP64() MOVQ(idx, next) ADDQ(Imm(unroll*p.avxOffset), next) CMPQ(next, len) JAE(LabelRef("x86_loop")) // Create unroll num vector registers var vectors [unroll]reg.VecVirtual for i := 0; i < unroll; i++ { vectors[i] = YMM() } // So here essentially what we're doing is populating pairs // of vector registers with 256 bits of integer data, so as an example // for uint64s, it would look like... // YMM0 [ x0, x1, x2, x3 ] // YMM1 [ y0, y1, y2, y3 ] // ... // YMM(N) ... // // We then use VPADDQ to perform a SIMD addition operation // on the pairs and the result is stored in even register (0,2,4...). // Finally we copy the results back out to the slice pointed to by x for offset, i := 0, 0; i < unroll/2; i++ { VMOVDQU(xPtr.Offset(i*32), vectors[offset]) VMOVDQU(yPtr.Offset(i*32), vectors[offset+1]) offset += 2 } // AVX intrinsics to sum 64 bit integers/quad words for offset, i := 0, 0; i < unroll/2; i++ { p.avxAdd(vectors[offset], vectors[offset+1], vectors[offset]) offset += 2 } for offset, i := 0, 0; i < unroll/2; i++ { VMOVDQU(vectors[offset], xPtr.Offset(i*32)) offset += 2 } // Increment ptrs and loop. MOVQ(next, idx) JMP(LabelRef("avx2_loop")) // Here's we're just going to manually bump our pointers // and do a the addition on the remaining integers (if any) Label("x86_loop") CMPQ(idx, len) JAE(LabelRef("return")) // Delegate to specific computation //calc() p.x86Mov(yPtr, p.x86Reg) p.x86Add(p.x86Reg, xPtr) // Increment ptrs and loop. ADDQ(Imm(1), idx) JMP(LabelRef("x86_loop")) Label("return") RET() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/sortedset/000077500000000000000000000000001452252572700247435ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/sortedset/dedupe_asm.go000066400000000000000000000300421452252572700273770ustar00rootroot00000000000000// +build ignore package main import ( "fmt" "math/bits" . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" . "github.com/segmentio/asm/build/internal/asm" . 
"github.com/segmentio/asm/build/internal/x86" "github.com/segmentio/asm/cpu" ) func init() { ConstraintExpr("!purego") } func main() { generateDedupe(new(dedupe1)) generateDedupe(new(dedupe2)) generateDedupe(new(dedupe4)) generateDedupe(new(dedupe8)) generateDedupe(new(dedupe16)) generateDedupe(new(dedupe32)) Generate() } type dedupe interface { size() int init(p, w GPVirtual) copy(p, w GPVirtual) } type dedupeAVX2 interface { dedupe vec() VecVirtual vsize() int vlanes() int vinit(p, w GPVirtual) vcopy(src0, src1, dst VecVirtual, off GPVirtual) } type dedupe1 struct{} func (*dedupe1) size() int { return 1 } func (*dedupe1) init(p, w GPVirtual) { move(MOVB, GP8(), p, w) } func (*dedupe1) copy(p, w GPVirtual) { generateDedupeX86(MOVB, CMPB, GP8, p, w, 1) } type dedupe2 struct{} func (*dedupe2) size() int { return 2 } func (*dedupe2) init(p, w GPVirtual) { move(MOVW, GP16(), p, w) } func (*dedupe2) copy(p, w GPVirtual) { generateDedupeX86(MOVW, CMPW, GP16, p, w, 2) } type dedupe4 struct { shuf GPVirtual incr GPVirtual } func (*dedupe4) size() int { return 4 } func (*dedupe4) init(p, w GPVirtual) { move(MOVL, GP32(), p, w) } func (*dedupe4) copy(p, w GPVirtual) { generateDedupeX86(MOVL, CMPL, GP32, p, w, 4) } func (d *dedupe4) vec() VecVirtual { return XMM() } func (d *dedupe4) vsize() int { return 16 } func (d *dedupe4) vlanes() int { return 8 } func (d *dedupe4) vinit(p, w GPVirtual) { move(MOVL, GP32(), p, w) d.shuf = GP64() LEAQ( ConstShuffleMask32("dedupe4_shuffle_mask", 0, 1, 2, 3, // 0b0000 1, 2, 3, 0, // 0b0001 0, 2, 3, 1, // 0b0010 2, 3, 0, 1, // 0b0011 0, 1, 3, 2, // 0b0100 1, 3, 0, 2, // 0b0101 0, 3, 1, 2, // 0b0110 3, 0, 1, 2, // 0b0111 0, 1, 2, 3, // 0b1000 1, 2, 0, 3, // 0b1001 0, 3, 1, 2, // 0b1010 2, 0, 1, 3, // 0b1011 0, 1, 2, 3, // 0b1100 1, 0, 2, 3, // 0b1101 0, 1, 2, 3, // 0b1110 0, 1, 2, 3, // 0b1111 ), d.shuf, ) d.incr = GP64() LEAQ( // A table indexing the number of bytes to advance the write pointer by, // depending on how many 4 bytes items were equal. ConstArray32("dedupe4_offset_array", // 0b0000, 0b0001, 0b0010, 0b0011 16, 12, 12, 8, // 0b0100, 0b0101, 0b0110, 0b0111 12, 8, 8, 4, // 0b1000, 0b1001, 0b1010, 0b1011 12, 8, 8, 4, // 0b1100, 0b1101, 0b1110, 0b1111 8, 4, 4, 0, ), d.incr, ) } func (d *dedupe4) vcopy(src0, src1, dst VecVirtual, off GPVirtual) { VPCMPEQD(src1, src0, src0) VMOVMSKPS(src0, off.As32()) // 16 possible states: // * 0b0000 // * 0b0001 // * 0b0010 // * 0b0011 // * ... // * 0b1111 // We multiply the mask by 4 (left shift 2) to use the value as index into // the shuffle mask table (128 bits) and offset array (32 bits). SHLQ(Imm(2), off) VPSHUFB(Mem{Base: d.shuf}.Idx(off, 4), src1, dst) MOVL(Mem{Base: d.incr}.Idx(off, 1), off.As32()) } type dedupe8 struct { shuf GPVirtual incr GPVirtual } func (*dedupe8) size() int { return 8 } func (*dedupe8) init(p, w GPVirtual) { move(MOVQ, GP64(), p, w) } func (*dedupe8) copy(p, w GPVirtual) { generateDedupeX86(MOVQ, CMPQ, GP64, p, w, 8) } func (*dedupe8) vec() VecVirtual { return XMM() } func (*dedupe8) vsize() int { return 16 } func (*dedupe8) vlanes() int { return 8 } func (d *dedupe8) vinit(p, w GPVirtual) { move(MOVQ, GP64(), p, w) d.shuf = GP64() d.incr = GP64() LEAQ( ConstShuffleMask64("dedupe8_shuffle_mask", // We use the interesting property that the first and second masks // overlap on their respective upper and lower 64 bits to use a // shuffle mask of 64 bits elements. 
// // This technique saves a shift instruction in the vcopy // implementation which would otherwise be required to convert the // bit mask values (0, 1, 2, 3) to indices into an array of 128 bits // elements (since only 1, 2, 4, and 8 scales are supported). // // This is the layout: // * (0b00 x 8)[128:0] => [0, 1]; copy all 128 bits // * (0b01 x 8)[128:0] => [1, 0]; copy the upper 64 bits (lower 64 bits are discarded) // * (0b10 x 8)[128:0] => [0, 0]; copy the lower 64 bits (upper 64 bits are discarded) // * (0b11 x 8)[128:0] => [0, 0]; all 128 bits are discarded 0, 1, 0, 0, 0, ), d.shuf, ) LEAQ( ConstArray64("dedupe8_offset_array", 16, 8, 8, 0), d.incr, ) } func (d *dedupe8) vcopy(src0, src1, dst VecVirtual, off GPVirtual) { VPCMPEQQ(src1, src0, src0) VMOVMSKPD(src0, off.As32()) VPSHUFB(Mem{Base: d.shuf}.Idx(off, 8), src1, dst) MOVQ(Mem{Base: d.incr}.Idx(off, 8), off) } type dedupe16 struct { nop GPVirtual inc GPVirtual } func (*dedupe16) size() int { return 16 } func (*dedupe16) init(p, w GPVirtual) { move(MOVOU, XMM(), p, w) } func (*dedupe16) copy(p, w GPVirtual) { next := GP64() MOVQ(w, next) ADDQ(Imm(16), next) xmm0, xmm1 := XMM(), XMM() MOVOU(Mem{Base: p}, xmm0) MOVOU(Mem{Base: p}.Offset(16), xmm1) MOVOU(xmm1, Mem{Base: w}) mask := GP32() PCMPEQQ(xmm0, xmm1) PMOVMSKB(xmm1, mask) CMPL(mask, U32(0xFFFF)) CMOVQNE(next, w) } func (*dedupe16) vec() VecVirtual { return XMM() } func (*dedupe16) vsize() int { return 16 } func (*dedupe16) vlanes() int { return 8 } func (d *dedupe16) vinit(p, w GPVirtual) { move(VMOVDQU, XMM(), p, w) d.nop = GP64() d.inc = GP64() XORQ(d.nop, d.nop) MOVQ(U64(16), d.inc) } func (d *dedupe16) vcopy(src0, src1, dst VecVirtual, off GPVirtual) { if src1 != dst { VMOVDQA(src1, dst) } VPCMPEQQ(src1, src0, src0) // This gives a bitmask with these possible values: // * 0b00 // * 0b01 // * 0b10 // * 0b11 // We only care about the last case, which indicates that both 64 bits lanes // of the XMM register were equal. VMOVMSKPD(src0, off.As32()) CMPQ(off, Imm(3)) CMOVQEQ(d.nop, off) CMOVQNE(d.inc, off) } type dedupe32 struct { nop GPVirtual inc GPVirtual } func (*dedupe32) size() int { return 32 } func (*dedupe32) init(p, w GPVirtual) { lo, hi := XMM(), XMM() MOVOU(Mem{Base: p}, lo) MOVOU(Mem{Base: p}.Offset(16), hi) MOVOU(lo, Mem{Base: w}) MOVOU(hi, Mem{Base: w}.Offset(16)) } func (*dedupe32) copy(p, w GPVirtual) { next := GP64() MOVQ(w, next) ADDQ(Imm(32), next) loP, hiP := XMM(), XMM() loQ, hiQ := XMM(), XMM() MOVOU(Mem{Base: p}, loP) MOVOU(Mem{Base: p}.Offset(16), hiP) MOVOU(Mem{Base: p}.Offset(32), loQ) MOVOU(Mem{Base: p}.Offset(48), hiQ) MOVOU(loQ, Mem{Base: w}) MOVOU(hiQ, Mem{Base: w}.Offset(16)) mask0, mask1 := GP32(), GP32() PCMPEQQ(loP, loQ) PCMPEQQ(hiP, hiQ) PMOVMSKB(loQ, mask0) PMOVMSKB(hiQ, mask1) ANDL(mask1, mask0) CMPL(mask0, U32(0xFFFF)) CMOVQNE(next, w) } func (*dedupe32) vec() VecVirtual { return YMM() } func (*dedupe32) vsize() int { return 32 } func (*dedupe32) vlanes() int { return 8 } func (d *dedupe32) vinit(p, w GPVirtual) { move(VMOVDQU, YMM(), p, w) d.nop = GP64() d.inc = GP64() XORQ(d.nop, d.nop) MOVQ(U64(32), d.inc) } func (d *dedupe32) vcopy(src0, src1, dst VecVirtual, off GPVirtual) { if src1 != dst { VMOVDQA(src1, dst) } VPCMPEQQ(src1, src0, src0) // This gives a bitmask with these possible values: // * 0b0000 // * 0b0001 // * ... // * 0b1111 // // We only care about the last case because it indicates that the full 32 // bytes are equal. // // We want to divide by 15, which will either produce a result of 0 or 1. 
// Rather than dividing, we add 1 and shift right by 4. VMOVMSKPD(src0, off.As32()) CMPQ(off, Imm(15)) CMOVQEQ(d.nop, off) CMOVQNE(d.inc, off) } func generateDedupe(dedupe dedupe) { size := dedupe.size() TEXT(fmt.Sprintf("dedupe%d", size), NOSPLIT, "func(dst, src []byte) int") n := Load(Param("src").Len(), GP64()) CMPQ(n, Imm(0)) JE(LabelRef("short")) dst := Load(Param("dst").Base(), GP64()) src := Load(Param("src").Base(), GP64()) // `p` is the read pointer that will be advanced through the input array // testing for equal pairs. // // `w` points to the position in the output buffer where the next item // is to be written. p := GP64() w := GP64() MOVQ(src, p) MOVQ(dst, w) SUBQ(Imm(uint64(size)), n) if avx, ok := dedupe.(dedupeAVX2); ok { CMPQ(n, Imm(uint64(avx.vsize()))) JL(LabelRef("init")) JumpIfFeature("avx2", cpu.AVX2) } Label("init") dedupe.init(p, w) ADDQ(Imm(uint64(size)), w) Label("tail") CMPQ(n, Imm(0)) JE(LabelRef("done")) Label("generic") dedupe.copy(p, w) ADDQ(Imm(uint64(size)), p) SUBQ(Imm(uint64(size)), n) CMPQ(n, Imm(0)) JG(LabelRef("generic")) Label("done") SUBQ(dst, w) Store(w, ReturnIndex(0)) RET() Label("short") Store(n, ReturnIndex(0)) RET() if avx, ok := dedupe.(dedupeAVX2); ok { avxLanes := avx.vlanes() avxChunk := avx.vsize() * avxLanes Label("avx2") src := make([]VecVirtual, avxLanes) dst := make([]VecVirtual, avxLanes) off := make([]GPVirtual, avxLanes) for i := range src { src[i] = avx.vec() dst[i] = avx.vec() off[i] = GP64() } avx.vinit(p, w) ADDQ(Imm(uint64(size)), w) // This bit of magic aligns the tail chunk size on the first power of // two smaller than the chunk size used in the loop. // // This is useful when the number of lanes in not a power of two. tailChunk := 1 << (63 - bits.LeadingZeros(uint(avxChunk))) if tailChunk == avxChunk { tailChunk /= 2 } CMPQ(n, U32(avxChunk)) if tailChunk >= avx.vsize() { JL(LabelRef(fmt.Sprintf("avx2_tail%d", tailChunk))) } else { JL(LabelRef("avx2_tail")) } Label(fmt.Sprintf("avx2_loop%d", avxChunk)) generateDedupeAVX2(p, w, src, dst, off, avx) ADDQ(U32(uint64(avxChunk)), p) SUBQ(U32(uint64(avxChunk)), n) CMPQ(n, U32(avxChunk)) JGE(LabelRef(fmt.Sprintf("avx2_loop%d", avxChunk))) for chunk := tailChunk; chunk >= avx.vsize(); chunk /= 2 { Label(fmt.Sprintf("avx2_tail%d", chunk)) CMPQ(n, Imm(uint64(chunk))) if next := chunk / 2; next >= avx.vsize() { JL(LabelRef(fmt.Sprintf("avx2_tail%d", chunk/2))) } else { JL(LabelRef("avx2_tail")) } lanes := chunk / avx.vsize() generateDedupeAVX2(p, w, src[:lanes], dst[:lanes], off[:lanes], avx) ADDQ(Imm(uint64(chunk)), p) SUBQ(Imm(uint64(chunk)), n) } Label("avx2_tail") VZEROUPPER() JMP(LabelRef("tail")) } } func generateDedupeX86(mov func(Op, Op), cmp func(Op, Op), reg func() GPVirtual, p, w GPVirtual, size int) { next := GP64() MOVQ(w, next) ADDQ(Imm(uint64(size)), next) r0, r1 := reg(), reg() mov(Mem{Base: p}, r0) mov(Mem{Base: p}.Offset(size), r1) mov(r1, Mem{Base: w}) cmp(r0, r1) CMOVQNE(next, w) } func generateDedupeAVX2(p, w GPVirtual, src, dst []VecVirtual, off []GPVirtual, dedupe dedupeAVX2) { size := dedupe.size() step := dedupe.vsize() moves := make(map[int]VecVirtual) for i := range src { VMOVDQU(Mem{Base: p}.Offset(i*step), src[i]) moves[i*step] = src[i] } reg := make([]VecVirtual, len(src)) for i := range dst { // Elide moves from memory if possible by reusing registers that // already contain the required data chunk. 
// // Care must be given in the implementations of AVX2 specializations // not to write to the second source, as it may unexpectedly mutate // the src0 or dst registers. if prev := moves[i*step+size]; prev != nil { reg[i] = prev } else { reg[i] = dst[i] VMOVDQU(Mem{Base: p}.Offset(i*step+size), dst[i]) } } for i := range src { dedupe.vcopy(src[i], reg[i], dst[i], off[i]) if i > 0 { // Compute the cumulative offsets so we can use indexes relative to the // write pointer, which allows the CPU to pipeline the writes to memory. // // There are still strong data dependencies between these instructions, // but I'm not sure there is a great alternative. Moving the values to a // vector register and using SIMD seems like a lost of heavy lifting for // the limited number of registers we have. ADDQ(off[i-1], off[i]) } } for i := range dst { if i == 0 { VMOVDQU(dst[i], Mem{Base: w}) } else { VMOVDQU(dst[i], Mem{Base: w}.Idx(off[i-1], 1)) } } ADDQ(off[len(off)-1], w) } func move(mov func(Op, Op), tmp Register, src, dst GPVirtual) { mov(Mem{Base: src}, tmp) mov(tmp, Mem{Base: dst}) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/build/sortedset/intersect16_asm.go000066400000000000000000000041621452252572700303040ustar00rootroot00000000000000// +build ignore package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" ) func init() { ConstraintExpr("!purego") } func main() { TEXT("intersect16", NOSPLIT, "func(dst, a, b []byte) int") // Load all pointers. dst := Load(Param("dst").Base(), GP64()) a := Load(Param("a").Base(), GP64()) b := Load(Param("b").Base(), GP64()) // Calculate the end of a/b so we know where to loop until. aEnd := Load(Param("a").Len(), GP64()) ADDQ(a, aEnd) bEnd := Load(Param("b").Len(), GP64()) ADDQ(b, bEnd) ones := XMM() VPCMPEQB(ones, ones, ones) // Load the first item from a/b. We know that each has at least // one item (this is enforced in the wrapper). aItem := XMM() bItem := XMM() VMOVUPS(Mem{Base: a}, aItem) VMOVUPS(Mem{Base: b}, bItem) Label("loop") // Compare bytes and extract two masks. // ne = mask of bytes where a!=b // lt = mask of bytes where aa, copy and advance a. 
Label("greater") VMOVUPS(bItem, Mem{Base: dst}) ADDQ(Imm(16), dst) ADDQ(Imm(16), b) CMPQ(b, bEnd) JE(LabelRef("done")) VMOVUPS(Mem{Base: b}, bItem) JMP(LabelRef("loop")) // If a TooLong, TooLong, TooLong, TooLong, TooLong, TooLong, TooLong, TooLong, // 10______ ________ TwoConts, TwoConts, TwoConts, TwoConts, // 1100____ ________ TooShort | Overlong2, // 1101____ ________ TooShort, // 1110____ ________ TooShort | Overlong3 | Surrogate, // 1111____ ________ TooShort | TooLarge | TooLarge1000 | Overlong4, }) nib2 = fullMask([16]byte{ // ____0000 ________ Carry | Overlong3 | Overlong2 | Overlong4, // ____0001 ________ Carry | Overlong2, // ____001_ ________ Carry, Carry, // ____0100 ________ Carry | TooLarge, // ____0101 ________ Carry | TooLarge | TooLarge1000, // ____011_ ________ Carry | TooLarge | TooLarge1000, Carry | TooLarge | TooLarge1000, // ____1___ ________ Carry | TooLarge | TooLarge1000, Carry | TooLarge | TooLarge1000, Carry | TooLarge | TooLarge1000, Carry | TooLarge | TooLarge1000, Carry | TooLarge | TooLarge1000, // ____1101 ________ Carry | TooLarge | TooLarge1000 | Surrogate, Carry | TooLarge | TooLarge1000, Carry | TooLarge | TooLarge1000, }) nib3 = fullMask([16]byte{ // ________ 0_______ TooShort, TooShort, TooShort, TooShort, TooShort, TooShort, TooShort, TooShort, // ________ 1000____ TooLong | Overlong2 | TwoConts | Overlong3 | TooLarge1000 | Overlong4, // ________ 1001____ TooLong | Overlong2 | TwoConts | Overlong3 | TooLarge, // ________ 101_____ TooLong | Overlong2 | TwoConts | Surrogate | TooLarge, TooLong | Overlong2 | TwoConts | Surrogate | TooLarge, // ________ 11______ TooShort, TooShort, TooShort, TooShort, }) return } func main() { TEXT("validateAvx", NOSPLIT, "func(p []byte) byte") Doc("Optimized version of Validate for inputs of more than 32B.") ret, err := ReturnIndex(0).Resolve() if err != nil { panic(err) } d := Load(Param("p").Base(), GP64()) n := Load(Param("p").Len(), GP64()) isAscii := GP8() MOVB(Imm(1), isAscii) Comment("Prepare the constant masks") incompleteMask := ConstBytes("incomplete_mask", incompleteMaskData()) incompleteMaskY := YMM() VMOVDQU(incompleteMask, incompleteMaskY) continuation4Bytes := ConstBytes("cont4_vec", continuationMaskData(0b11110000)) continuation4BytesY := YMM() VMOVDQU(continuation4Bytes, continuation4BytesY) continuation3Bytes := ConstBytes("cont3_vec", continuationMaskData(0b11100000)) continuation3BytesY := YMM() VMOVDQU(continuation3Bytes, continuation3BytesY) nib1Data, nib2Data, nib3Data := nibbleMasksData() Comment("High nibble of current byte") nibble1Errors := ConstBytes("nibble1_errors", nib1Data) nibble1Y := YMM() VMOVDQU(nibble1Errors, nibble1Y) Comment("Low nibble of current byte") nibble2Errors := ConstBytes("nibble2_errors", nib2Data) nibble2Y := YMM() VMOVDQU(nibble2Errors, nibble2Y) Comment("High nibble of the next byte") nibble3Errors := ConstBytes("nibble3_errors", nib3Data) nibble3Y := YMM() VMOVDQU(nibble3Errors, nibble3Y) Comment("Nibble mask") lowerNibbleMask := ConstArray64("nibble_mask", 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, ) nibbleMaskY := YMM() VMOVDQU(lowerNibbleMask, nibbleMaskY) Comment("MSB mask") msbMask := ConstArray64("msb_mask", 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, ) msbMaskY := YMM() VMOVDQU(msbMask, msbMaskY) Comment("For the first pass, set the previous block as zero.") previousBlockY := YMM() zeroOutVector(previousBlockY) Comment("Zeroes the error vector.") errorY := YMM() zeroOutVector(errorY) 
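// The nibble tables loaded above appear to implement the lookup-based UTF-8
// classification described by Keiser and Lemire in "Validating UTF-8 In Less
// Than One Instruction Per Byte": each byte pair is classified by three
// nibble-indexed shuffles whose results are ANDed together. A scalar sketch
// (illustrative only):
//
//	// A non-zero result, beyond the expected continuation markers handled
//	// later, flags an invalid sequence.
//	err := nib1[prev>>4] & nib2[prev&0xF] & nib3[cur>>4]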
Comment(`Zeroes the "previous block was incomplete" vector.`) incompletePreviousBlockY := YMM() zeroOutVector(incompletePreviousBlockY) Comment("Top of the loop.") Label("check_input") currentBlockY := YMM() Comment("if bytes left >= 32") CMPQ(n, U8(32)) Comment("go process the next block") JGE(LabelRef("process")) Comment("If < 32 bytes left") Comment("Fast exit if done") CMPQ(n, U8(0)) JE(LabelRef("end")) // At this point we know we need to load up to 32 bytes of input to // finish the validation and pad the rest of the input vector with // zeroes. // // This code assumes that the remainder of the input data ends right // before a page boundary. As a result, we need to take special care to // avoid a page fault. // // At a high level: // // 1. Move back the data pointer so that the 32 bytes load ends exactly // where the input does. // // 2. Shift right the loaded input so that the remaining input starts at // the beginning of the vector. // // 3. Pad the rest of the vector with zeroes. // // Because AVX2 32 bytes vectors are really two 16 bytes vectors, we need // to jump through hoops to perform the shift operation across // lanes. This code has two versions, one for inputs of less than 16 // bytes, and one for larger inputs. Though the latter has more steps, // they work using a shuffle mask to shift the bytes in the vector, and // a blend operation to stitch together the various pieces of the // resulting vector. // // TODO: faster load code when not near a page boundary. Comment("If 0 < bytes left < 32") zeroes := YMM() VPXOR(zeroes, zeroes, zeroes) shuffleMaskBytes := make([]byte, 3*16) for i := byte(0); i < 16; i++ { shuffleMaskBytes[i] = i shuffleMaskBytes[i+16] = i shuffleMaskBytes[i+32] = i } shuffleMask := ConstBytes("shuffle_mask", shuffleMaskBytes) shuffleClearMaskBytes := make([]byte, 3*16) for i := byte(0); i < 16; i++ { shuffleClearMaskBytes[i] = i shuffleClearMaskBytes[i+16] = 0xFF shuffleClearMaskBytes[i+32] = 0xFF } shuffleClearMask := ConstBytes("shuffle_clear_mask", shuffleClearMaskBytes) offset := GP64() shuffleMaskPtr := GP64() shuffle := YMM() tmp1 := YMM() MOVQ(U64(32), offset) SUBQ(n, offset) SUBQ(offset, d) VMOVDQU(Mem{Base: d}, currentBlockY) CMPQ(n, U8(16)) JA(LabelRef("tail_load_large")) Comment("Shift right that works if remaining bytes <= 16, safe next to a page boundary") VPERM2I128(Imm(3), currentBlockY, zeroes, currentBlockY) LEAQ(shuffleClearMask.Offset(16), shuffleMaskPtr) ADDQ(n, offset) ADDQ(n, offset) SUBQ(Imm(32), offset) SUBQ(offset, shuffleMaskPtr) VMOVDQU(Mem{Base: shuffleMaskPtr}, shuffle) VPSHUFB(shuffle, currentBlockY, currentBlockY) XORQ(n, n) JMP(LabelRef("loaded")) Comment("Shift right that works if remaining bytes >= 16, safe next to a page boundary") Label("tail_load_large") ADDQ(n, offset) ADDQ(n, offset) SUBQ(Imm(48), offset) LEAQ(shuffleMask.Offset(16), shuffleMaskPtr) SUBQ(offset, shuffleMaskPtr) VMOVDQU(Mem{Base: shuffleMaskPtr}, shuffle) VPSHUFB(shuffle, currentBlockY, tmp1) tmp2 := YMM() VPERM2I128(Imm(3), currentBlockY, zeroes, tmp2) VPSHUFB(shuffle, tmp2, tmp2) blendMaskBytes := make([]byte, 3*16) for i := byte(0); i < 16; i++ { blendMaskBytes[i] = 0xFF blendMaskBytes[i+16] = 0x00 blendMaskBytes[i+32] = 0xFF } blendMask := ConstBytes("blend_mask", blendMaskBytes) blendMaskStartPtr := GP64() LEAQ(blendMask.Offset(16), blendMaskStartPtr) SUBQ(offset, blendMaskStartPtr) blend := YMM() VBROADCASTF128(Mem{Base: blendMaskStartPtr}, blend) VPBLENDVB(blend, tmp1, tmp2, currentBlockY) XORQ(n, n) JMP(LabelRef("loaded")) Comment("Process 
one 32B block of data") Label("process") Comment("Load the next block of bytes") VMOVDQU(Mem{Base: d}, currentBlockY) SUBQ(U8(32), n) ADDQ(U8(32), d) Label("loaded") Comment("Fast check to see if ASCII") tmp := GP32() VPMOVMSKB(currentBlockY, tmp) CMPL(tmp, Imm(0)) JNZ(LabelRef("non_ascii")) Comment("If this whole block is ASCII, there is nothing to do, and it is an error if any of the previous code point was incomplete.") VPOR(errorY, incompletePreviousBlockY, errorY) JMP(LabelRef("check_input")) Label("non_ascii") XORB(isAscii, isAscii) Comment("Prepare intermediate vector for push operations") vp := YMM() VPERM2I128(Imm(3), previousBlockY, currentBlockY, vp) Comment("Check errors on the high nibble of the previous byte") previousY := YMM() VPALIGNR(Imm(15), vp, currentBlockY, previousY) highPrev := highNibbles(previousY, nibbleMaskY) VPSHUFB(highPrev, nibble1Y, highPrev) Comment("Check errors on the low nibble of the previous byte") lowPrev := lowNibbles(previousY, nibbleMaskY) VPSHUFB(lowPrev, nibble2Y, lowPrev) VPAND(lowPrev, highPrev, highPrev) Comment("Check errors on the high nibble on the current byte") highCurr := highNibbles(currentBlockY, nibbleMaskY) VPSHUFB(highCurr, nibble3Y, highCurr) VPAND(highCurr, highPrev, highPrev) Comment("Find 3 bytes continuations") off2 := YMM() VPALIGNR(Imm(14), vp, currentBlockY, off2) VPSUBUSB(continuation3BytesY, off2, off2) Comment("Find 4 bytes continuations") off3 := YMM() VPALIGNR(Imm(13), vp, currentBlockY, off3) VPSUBUSB(continuation4BytesY, off3, off3) Comment("Combine them to have all continuations") continuationBitsY := YMM() VPOR(off2, off3, continuationBitsY) Comment("Perform a byte-sized signed comparison with zero to turn any non-zero bytes into 0xFF.") tmpY := zeroOutVector(YMM()) VPCMPGTB(tmpY, continuationBitsY, continuationBitsY) Comment("Find bytes that are continuations by looking at their most significant bit.") VPAND(msbMaskY, continuationBitsY, continuationBitsY) Comment("Find mismatches between expected and actual continuation bytes") VPXOR(continuationBitsY, highPrev, continuationBitsY) Comment("Store result in sticky error") VPOR(errorY, continuationBitsY, errorY) Comment("Prepare for next iteration") VPSUBUSB(incompleteMaskY, currentBlockY, incompletePreviousBlockY) VMOVDQU(currentBlockY, previousBlockY) Comment("End of loop") JMP(LabelRef("check_input")) Label("end") Comment("If the previous block was incomplete, this is an error.") VPOR(incompletePreviousBlockY, errorY, errorY) Comment("Return whether any error bit was set") VPTEST(errorY, errorY) out := GP8() SETEQ(out) Comment("Bit 0 tells if the input is valid utf8, bit 1 tells if it's valid ascii") ANDB(out, isAscii) SHLB(Imm(1), isAscii) ORB(isAscii, out) MOVB(out, ret.Addr) VZEROUPPER() RET() Generate() } func lowNibbles(a VecVirtual, nibbleMask VecVirtual) VecVirtual { out := YMM() VPAND(a, nibbleMask, out) return out } func highNibbles(a VecVirtual, nibbleMask VecVirtual) VecVirtual { out := YMM() VPSRLW(Imm(4), a, out) VPAND(out, nibbleMask, out) return out } func zeroOutVector(y VecVirtual) VecVirtual { VXORPS(y, y, y) return y } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/000077500000000000000000000000001452252572700224175ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/arm/000077500000000000000000000000001452252572700231765ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/arm/arm.go000066400000000000000000000056641452252572700243170ustar00rootroot00000000000000package 
arm import ( "github.com/segmentio/asm/cpu/cpuid" . "golang.org/x/sys/cpu" ) type CPU cpuid.CPU func (cpu CPU) Has(feature Feature) bool { return cpuid.CPU(cpu).Has(cpuid.Feature(feature)) } func (cpu *CPU) set(feature Feature, enable bool) { (*cpuid.CPU)(cpu).Set(cpuid.Feature(feature), enable) } type Feature cpuid.Feature const ( SWP Feature = 1 << iota // SWP instruction support HALF // Half-word load and store support THUMB // ARM Thumb instruction set BIT26 // Address space limited to 26-bits FASTMUL // 32-bit operand, 64-bit result multiplication support FPA // Floating point arithmetic support VFP // Vector floating point support EDSP // DSP Extensions support JAVA // Java instruction set IWMMXT // Intel Wireless MMX technology support CRUNCH // MaverickCrunch context switching and handling THUMBEE // Thumb EE instruction set NEON // NEON instruction set VFPv3 // Vector floating point version 3 support VFPv3D16 // Vector floating point version 3 D8-D15 TLS // Thread local storage support VFPv4 // Vector floating point version 4 support IDIVA // Integer divide instruction support in ARM mode IDIVT // Integer divide instruction support in Thumb mode VFPD32 // Vector floating point version 3 D15-D31 LPAE // Large Physical Address Extensions EVTSTRM // Event stream support AES // AES hardware implementation PMULL // Polynomial multiplication instruction set SHA1 // SHA1 hardware implementation SHA2 // SHA2 hardware implementation CRC32 // CRC32 hardware implementation ) func ABI() CPU { cpu := CPU(0) cpu.set(SWP, ARM.HasSWP) cpu.set(HALF, ARM.HasHALF) cpu.set(THUMB, ARM.HasTHUMB) cpu.set(BIT26, ARM.Has26BIT) cpu.set(FASTMUL, ARM.HasFASTMUL) cpu.set(FPA, ARM.HasFPA) cpu.set(VFP, ARM.HasVFP) cpu.set(EDSP, ARM.HasEDSP) cpu.set(JAVA, ARM.HasJAVA) cpu.set(IWMMXT, ARM.HasIWMMXT) cpu.set(CRUNCH, ARM.HasCRUNCH) cpu.set(THUMBEE, ARM.HasTHUMBEE) cpu.set(NEON, ARM.HasNEON) cpu.set(VFPv3, ARM.HasVFPv3) cpu.set(VFPv3D16, ARM.HasVFPv3D16) cpu.set(TLS, ARM.HasTLS) cpu.set(VFPv4, ARM.HasVFPv4) cpu.set(IDIVA, ARM.HasIDIVA) cpu.set(IDIVT, ARM.HasIDIVT) cpu.set(VFPD32, ARM.HasVFPD32) cpu.set(LPAE, ARM.HasLPAE) cpu.set(EVTSTRM, ARM.HasEVTSTRM) cpu.set(AES, ARM.HasAES) cpu.set(PMULL, ARM.HasPMULL) cpu.set(SHA1, ARM.HasSHA1) cpu.set(SHA2, ARM.HasSHA2) cpu.set(CRC32, ARM.HasCRC32) return cpu } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/arm64/000077500000000000000000000000001452252572700233505ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/arm64/arm64.go000066400000000000000000000054631452252572700246400ustar00rootroot00000000000000package arm64 import ( "github.com/segmentio/asm/cpu/cpuid" . 
"golang.org/x/sys/cpu" ) type CPU cpuid.CPU func (cpu CPU) Has(feature Feature) bool { return cpuid.CPU(cpu).Has(cpuid.Feature(feature)) } func (cpu *CPU) set(feature Feature, enable bool) { (*cpuid.CPU)(cpu).Set(cpuid.Feature(feature), enable) } type Feature cpuid.Feature const ( FP Feature = 1 << iota // Floating-point instruction set (always available) ASIMD // Advanced SIMD (always available) EVTSTRM // Event stream support AES // AES hardware implementation PMULL // Polynomial multiplication instruction set SHA1 // SHA1 hardware implementation SHA2 // SHA2 hardware implementation CRC32 // CRC32 hardware implementation ATOMICS // Atomic memory operation instruction set FPHP // Half precision floating-point instruction set ASIMDHP // Advanced SIMD half precision instruction set CPUID // CPUID identification scheme registers ASIMDRDM // Rounding double multiply add/subtract instruction set JSCVT // Javascript conversion from floating-point to integer FCMA // Floating-point multiplication and addition of complex numbers LRCPC // Release Consistent processor consistent support DCPOP // Persistent memory support SHA3 // SHA3 hardware implementation SM3 // SM3 hardware implementation SM4 // SM4 hardware implementation ASIMDDP // Advanced SIMD double precision instruction set SHA512 // SHA512 hardware implementation SVE // Scalable Vector Extensions ASIMDFHM // Advanced SIMD multiplication FP16 to FP32 ) func ABI() CPU { cpu := CPU(0) cpu.set(FP, ARM64.HasFP) cpu.set(ASIMD, ARM64.HasASIMD) cpu.set(EVTSTRM, ARM64.HasEVTSTRM) cpu.set(AES, ARM64.HasAES) cpu.set(PMULL, ARM64.HasPMULL) cpu.set(SHA1, ARM64.HasSHA1) cpu.set(SHA2, ARM64.HasSHA2) cpu.set(CRC32, ARM64.HasCRC32) cpu.set(ATOMICS, ARM64.HasATOMICS) cpu.set(FPHP, ARM64.HasFPHP) cpu.set(ASIMDHP, ARM64.HasASIMDHP) cpu.set(CPUID, ARM64.HasCPUID) cpu.set(ASIMDRDM, ARM64.HasASIMDRDM) cpu.set(JSCVT, ARM64.HasJSCVT) cpu.set(FCMA, ARM64.HasFCMA) cpu.set(LRCPC, ARM64.HasLRCPC) cpu.set(DCPOP, ARM64.HasDCPOP) cpu.set(SHA3, ARM64.HasSHA3) cpu.set(SM3, ARM64.HasSM3) cpu.set(SM4, ARM64.HasSM4) cpu.set(ASIMDDP, ARM64.HasASIMDDP) cpu.set(SHA512, ARM64.HasSHA512) cpu.set(SVE, ARM64.HasSVE) cpu.set(ASIMDFHM, ARM64.HasASIMDFHM) return cpu } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/cpu.go000066400000000000000000000011141452252572700235320ustar00rootroot00000000000000// Pakage cpu provides APIs to detect CPU features available at runtime. package cpu import ( "github.com/segmentio/asm/cpu/arm" "github.com/segmentio/asm/cpu/arm64" "github.com/segmentio/asm/cpu/x86" ) var ( // X86 is the bitset representing the set of the x86 instruction sets are // supported by the CPU. X86 = x86.ABI() // ARM is the bitset representing which parts of the arm instruction sets // are supported by the CPU. ARM = arm.ABI() // ARM64 is the bitset representing which parts of the arm64 instruction // sets are supported by the CPU. 
ARM64 = arm64.ABI() ) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/cpu_test.go000066400000000000000000000054231452252572700246000ustar00rootroot00000000000000package cpu_test import ( "testing" "github.com/segmentio/asm/cpu/arm64" "github.com/segmentio/asm/cpu/cpuid" "github.com/segmentio/asm/cpu/x86" ) var x86Tests = map[string]cpuid.Feature{ "SSE": cpuid.Feature(x86.SSE), "SSE2": cpuid.Feature(x86.SSE2), "SSE3": cpuid.Feature(x86.SSE3), "SSE41": cpuid.Feature(x86.SSE41), "SSE42": cpuid.Feature(x86.SSE42), "SSE4A": cpuid.Feature(x86.SSE4A), "SSSE3": cpuid.Feature(x86.SSSE3), "AVX": cpuid.Feature(x86.AVX), "AVX2": cpuid.Feature(x86.AVX2), "AVX512BF16": cpuid.Feature(x86.AVX512BF16), "AVX512BITALG": cpuid.Feature(x86.AVX512BITALG), "AVX512BW": cpuid.Feature(x86.AVX512BW), "AVX512CD": cpuid.Feature(x86.AVX512CD), "AVX512DQ": cpuid.Feature(x86.AVX512DQ), "AVX512ER": cpuid.Feature(x86.AVX512ER), "AVX512F": cpuid.Feature(x86.AVX512F), "AVX512IFMA": cpuid.Feature(x86.AVX512IFMA), "AVX512PF": cpuid.Feature(x86.AVX512PF), "AVX512VBMI": cpuid.Feature(x86.AVX512VBMI), "AVX512VBMI2": cpuid.Feature(x86.AVX512VBMI2), "AVX512VL": cpuid.Feature(x86.AVX512VL), "AVX512VNNI": cpuid.Feature(x86.AVX512VNNI), "AVX512VP2INTERSECT": cpuid.Feature(x86.AVX512VP2INTERSECT), "AVX512VPOPCNTDQ": cpuid.Feature(x86.AVX512VPOPCNTDQ), } var arm64Tests = map[string]cpuid.Feature{ "ASIMD": cpuid.Feature(arm64.ASIMD), "ASIMDDP": cpuid.Feature(arm64.ASIMDDP), "ASIMDHP": cpuid.Feature(arm64.ASIMDHP), "ASIMDRDM": cpuid.Feature(arm64.ASIMDRDM), } func TestCPU(t *testing.T) { for _, test := range []struct { arch string feat map[string]cpuid.Feature }{ {arch: "x86", feat: x86Tests}, {arch: "arm64", feat: arm64Tests}, } { t.Run("none", func(t *testing.T) { c := cpuid.CPU(cpuid.None) for name, feature := range test.feat { t.Run(name, func(t *testing.T) { if c.Has(feature) { t.Error("cpuid.None must not have any features enabled") } }) } }) t.Run("all", func(t *testing.T) { c := cpuid.CPU(cpuid.All) for name, feature := range test.feat { t.Run(name, func(t *testing.T) { if !c.Has(feature) { t.Errorf("missing a feature that should have been enabled by cpuid.All") } }) } }) t.Run("single", func(t *testing.T) { for name, feature := range test.feat { t.Run(name, func(t *testing.T) { c := cpuid.CPU(0) c.Set(feature, true) for n, f := range test.feat { if n == name { if !c.Has(f) { t.Errorf("expected feature not set on CPU: %s", n) } } else { if c.Has(f) { t.Errorf("unexpected feature set on CPU: %s", n) } } } }) } }) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/cpuid/000077500000000000000000000000001452252572700235235ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/cpuid/cpuid.go000066400000000000000000000016171452252572700251630ustar00rootroot00000000000000// Package cpuid provides generic types used to represent CPU features supported // by the architecture. package cpuid // CPU is a bitset of feature flags representing the capabilities of various CPU // architectures that this package provides optimized assembly routines for. // // The intent is to provide a stable ABI between the Go code that generates the // assembly, and the program that uses the library functions. type CPU uint64 // Feature represents a single CPU feature. type Feature uint64 const ( // None is a Feature value that has no CPU features enabled. None Feature = 0 // All is a Feature value that has all CPU features enabled. 
All Feature = 0xFFFFFFFFFFFFFFFF ) func (cpu CPU) Has(feature Feature) bool { return (Feature(cpu) & feature) == feature } func (cpu *CPU) Set(feature Feature, enabled bool) { if enabled { *cpu |= CPU(feature) } else { *cpu &= ^CPU(feature) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/x86/000077500000000000000000000000001452252572700230445ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/cpu/x86/x86.go000066400000000000000000000064241452252572700240260ustar00rootroot00000000000000package x86 import ( "github.com/segmentio/asm/cpu/cpuid" . "golang.org/x/sys/cpu" ) type CPU cpuid.CPU func (cpu CPU) Has(feature Feature) bool { return cpuid.CPU(cpu).Has(cpuid.Feature(feature)) } func (cpu *CPU) set(feature Feature, enable bool) { (*cpuid.CPU)(cpu).Set(cpuid.Feature(feature), enable) } type Feature cpuid.Feature const ( SSE Feature = 1 << iota // SSE functions SSE2 // P4 SSE functions SSE3 // Prescott SSE3 functions SSE41 // Penryn SSE4.1 functions SSE42 // Nehalem SSE4.2 functions SSE4A // AMD Barcelona microarchitecture SSE4a instructions SSSE3 // Conroe SSSE3 functions AVX // AVX functions AVX2 // AVX2 functions AVX512BF16 // AVX-512 BFLOAT16 Instructions AVX512BITALG // AVX-512 Bit Algorithms AVX512BW // AVX-512 Byte and Word Instructions AVX512CD // AVX-512 Conflict Detection Instructions AVX512DQ // AVX-512 Doubleword and Quadword Instructions AVX512ER // AVX-512 Exponential and Reciprocal Instructions AVX512F // AVX-512 Foundation AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions AVX512PF // AVX-512 Prefetch Instructions AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2 AVX512VL // AVX-512 Vector Length Extensions AVX512VNNI // AVX-512 Vector Neural Network Instructions AVX512VP2INTERSECT // AVX-512 Intersect for D/Q AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword CMOV // Conditional move ) func ABI() CPU { cpu := CPU(0) cpu.set(SSE, true) // TODO: golang.org/x/sys/cpu assumes all CPUs have SSE? cpu.set(SSE2, X86.HasSSE2) cpu.set(SSE3, X86.HasSSE3) cpu.set(SSE41, X86.HasSSE41) cpu.set(SSE42, X86.HasSSE42) cpu.set(SSE4A, false) // TODO: add upstream support in golang.org/x/sys/cpu? cpu.set(SSSE3, X86.HasSSSE3) cpu.set(AVX, X86.HasAVX) cpu.set(AVX2, X86.HasAVX2) cpu.set(AVX512BF16, X86.HasAVX512BF16) cpu.set(AVX512BITALG, X86.HasAVX512BITALG) cpu.set(AVX512BW, X86.HasAVX512BW) cpu.set(AVX512CD, X86.HasAVX512CD) cpu.set(AVX512DQ, X86.HasAVX512DQ) cpu.set(AVX512ER, X86.HasAVX512ER) cpu.set(AVX512F, X86.HasAVX512F) cpu.set(AVX512IFMA, X86.HasAVX512IFMA) cpu.set(AVX512PF, X86.HasAVX512PF) cpu.set(AVX512VBMI, X86.HasAVX512VBMI) cpu.set(AVX512VBMI2, X86.HasAVX512VBMI2) cpu.set(AVX512VL, X86.HasAVX512VL) cpu.set(AVX512VNNI, X86.HasAVX512VNNI) cpu.set(AVX512VP2INTERSECT, false) // TODO: add upstream support in golang.org/x/sys/cpu? cpu.set(AVX512VPOPCNTDQ, X86.HasAVX512VPOPCNTDQ) cpu.set(CMOV, true) // TODO: golang.org/x/sys/cpu assumes all CPUs have CMOV? 
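// Callers typically use the resulting bitset for runtime dispatch between
// specializations; for example (a hypothetical sketch, not part of this
// package):
//
//	if cpu.X86.Has(x86.AVX2) {
//		// call the AVX2 implementation
//	} else {
//		// fall back to the portable Go implementation
//	}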
return cpu } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/go.mod000066400000000000000000000001461452252572700227370ustar00rootroot00000000000000module github.com/segmentio/asm go 1.18 require golang.org/x/sys v0.0.0-20220412211240-33da011f77ad golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/go.sum000066400000000000000000000003171452252572700227640ustar00rootroot00000000000000golang.org/x/sys v0.0.0-20220412211240-33da011f77ad h1:ntjMns5wyP/fN65tdBD4g8J5w8n015+iIIs9rtjXkY0= golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/internal/000077500000000000000000000000001452252572700234445ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/internal/buffer/000077500000000000000000000000001452252572700247155ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/internal/buffer/buffer_default.go000066400000000000000000000010131452252572700302140ustar00rootroot00000000000000//go:build purego || (!aix && !android && !darwin && !dragonfly && !freebsd && !illumos && !ios && !linux && !netbsd && !openbsd && !plan9 && !solaris) // +build purego !aix,!android,!darwin,!dragonfly,!freebsd,!illumos,!ios,!linux,!netbsd,!openbsd,!plan9,!solaris package buffer type Buffer []byte func New(n int) (Buffer, error) { return make([]byte, n), nil } func (a *Buffer) ProtectHead() []byte { return []byte(*a) } func (a *Buffer) ProtectTail() []byte { return []byte(*a) } func (a *Buffer) Release() { } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/internal/buffer/buffer_mmap.go000066400000000000000000000022121452252572700275240ustar00rootroot00000000000000//go:build !purego && (aix || android || darwin || dragonfly || freebsd || illumos || ios || linux || netbsd || openbsd || plan9 || solaris) // +build !purego // +build aix android darwin dragonfly freebsd illumos ios linux netbsd openbsd plan9 solaris // TODO: replace the above with go:build unix once Go 1.19 is the lowest // supported version package buffer import ( "golang.org/x/sys/unix" ) type Buffer struct { n int pg int mmap []byte } func New(n int) (Buffer, error) { pg := unix.Getpagesize() full := ((n+(pg-1))/pg + 2) * pg b, err := unix.Mmap(-1, 0, full, unix.PROT_NONE, unix.MAP_ANON|unix.MAP_PRIVATE) if err != nil { return Buffer{}, err } if n > 0 { err = unix.Mprotect(b[pg:full-pg], unix.PROT_READ|unix.PROT_WRITE) if err != nil { unix.Munmap(b) return Buffer{}, err } } return Buffer{ n: n, pg: pg, mmap: b, }, nil } func (a *Buffer) ProtectHead() []byte { head := a.pg return a.mmap[head : head+a.n : head+a.n] } func (a *Buffer) ProtectTail() []byte { tail := len(a.mmap) - a.pg - a.n return a.mmap[tail : tail+a.n : tail+a.n] } func (a *Buffer) Release() { unix.Munmap(a.mmap) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/internal/modulo.go000066400000000000000000000006141452252572700252730ustar00rootroot00000000000000package internal func MultipleOf(size, n int) bool { return (isPowTwo(size) && modPowTwo(n, size) == 0) || n%size == 0 } func PairMultipleOf(size, n, m int) bool { return (isPowTwo(size) && modPowTwo(n, size) == 0 && modPowTwo(m, size) == 0) || (n%size == 0 && m%size == 0) } func isPowTwo(n int) bool { return modPowTwo(n, n) == 0 } func modPowTwo(n, m int) int { return n & (m - 1) } 
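// To illustrate the bit trick used by modulo.go above (a hedged example,
// not part of the package): when size is a power of two, n%size equals
// n&(size-1), so MultipleOf never needs a division for such sizes.
//
//	package main
//
//	import "fmt"
//
//	func main() {
//		fmt.Println(48&15 == 0) // true, same as 48%16 == 0
//		fmt.Println(50&15 == 0) // false, same as 50%16 == 0
//	}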
golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/internal/unsafebytes/000077500000000000000000000000001452252572700257745ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/internal/unsafebytes/unsafebytes.go000066400000000000000000000005211452252572700306510ustar00rootroot00000000000000package unsafebytes import "unsafe" func Pointer(b []byte) *byte { return *(**byte)(unsafe.Pointer(&b)) } func String(b []byte) string { return *(*string)(unsafe.Pointer(&b)) } func BytesOf(s string) []byte { return *(*[]byte)(unsafe.Pointer(&sliceHeader{str: s, cap: len(s)})) } type sliceHeader struct { str string cap int } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/000077500000000000000000000000001452252572700231345ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/keyset.go000066400000000000000000000017111452252572700247670ustar00rootroot00000000000000package keyset import ( "bytes" "github.com/segmentio/asm/cpu" "github.com/segmentio/asm/cpu/arm64" "github.com/segmentio/asm/cpu/x86" ) // New prepares a set of keys for use with Lookup. // // An optimized routine is used if the processor supports AVX instructions and // the maximum length of any of the keys is less than or equal to 16. If New // returns nil, this indicates that an optimized routine is not available, and // the caller should use a fallback. func New(keys [][]byte) []byte { maxWidth, hasNullByte := checkKeys(keys) if hasNullByte || maxWidth > 16 || !(cpu.X86.Has(x86.AVX) || cpu.ARM64.Has(arm64.ASIMD)) { return nil } set := make([]byte, len(keys)*16) for i, k := range keys { copy(set[i*16:], k) } return set } func checkKeys(keys [][]byte) (maxWidth int, hasNullByte bool) { for _, k := range keys { if len(k) > maxWidth { maxWidth = len(k) } if bytes.IndexByte(k, 0) >= 0 { hasNullByte = true } } return } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/keyset_amd64.go000066400000000000000000000005501452252572700257620ustar00rootroot00000000000000// Code generated by command: go run keyset_asm.go -pkg keyset -out ../keyset/keyset_amd64.s -stubs ../keyset/keyset_amd64.go. DO NOT EDIT. //go:build !purego package keyset // Lookup searches for a key in a set of keys, returning its index if // found. If the key cannot be found, the number of keys is returned. func Lookup(keyset []byte, key []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/keyset_amd64.s000066400000000000000000000041101452252572700256130ustar00rootroot00000000000000// Code generated by command: go run keyset_asm.go -pkg keyset -out ../keyset/keyset_amd64.s -stubs ../keyset/keyset_amd64.go. DO NOT EDIT. 
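// keyset.New above documents that it returns nil when no optimized routine
// applies, leaving the slow path to the caller. A caller-side fallback
// might look like this sketch (hypothetical, assuming a plain map as the
// slow path):
//
//	keys := [][]byte{[]byte("foo"), []byte("bar")}
//	if set := keyset.New(keys); set != nil {
//		_ = keyset.Lookup(set, []byte("bar")) // 1
//	} else {
//		index := make(map[string]int, len(keys))
//		for i, k := range keys {
//			index[string(k)] = i
//		}
//	}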
//go:build !purego #include "textflag.h" // func Lookup(keyset []byte, key []byte) int // Requires: AVX TEXT ·Lookup(SB), NOSPLIT, $0-56 MOVQ keyset_base+0(FP), AX MOVQ keyset_len+8(FP), CX SHRQ $0x04, CX MOVQ key_base+24(FP), DX MOVQ key_len+32(FP), BX MOVQ key_cap+40(FP), SI CMPQ BX, $0x10 JA not_found CMPQ SI, $0x10 JB safe_load load: VMOVUPS (DX), X0 prepare: VPXOR X2, X2, X2 VPCMPEQB X1, X1, X1 LEAQ blend_masks<>+16(SB), DX SUBQ BX, DX VMOVUPS (DX), X3 VPBLENDVB X3, X0, X2, X0 XORQ DX, DX MOVQ CX, BX SHRQ $0x02, BX SHLQ $0x02, BX bigloop: CMPQ DX, BX JE loop VPCMPEQB (AX), X0, X8 VPTEST X1, X8 JCS done VPCMPEQB 16(AX), X0, X9 VPTEST X1, X9 JCS found1 VPCMPEQB 32(AX), X0, X10 VPTEST X1, X10 JCS found2 VPCMPEQB 48(AX), X0, X11 VPTEST X1, X11 JCS found3 ADDQ $0x04, DX ADDQ $0x40, AX JMP bigloop loop: CMPQ DX, CX JE done VPCMPEQB (AX), X0, X2 VPTEST X1, X2 JCS done INCQ DX ADDQ $0x10, AX JMP loop JMP done found3: INCQ DX found2: INCQ DX found1: INCQ DX done: MOVQ DX, ret+48(FP) RET not_found: MOVQ CX, ret+48(FP) RET safe_load: MOVQ DX, SI ANDQ $0x00000fff, SI CMPQ SI, $0x00000ff0 JBE load MOVQ $0xfffffffffffffff0, SI ADDQ BX, SI VMOVUPS (DX)(SI*1), X0 LEAQ shuffle_masks<>+16(SB), DX SUBQ BX, DX VMOVUPS (DX), X1 VPSHUFB X1, X0, X0 JMP prepare DATA blend_masks<>+0(SB)/8, $0xffffffffffffffff DATA blend_masks<>+8(SB)/8, $0xffffffffffffffff DATA blend_masks<>+16(SB)/8, $0x0000000000000000 DATA blend_masks<>+24(SB)/8, $0x0000000000000000 GLOBL blend_masks<>(SB), RODATA|NOPTR, $32 DATA shuffle_masks<>+0(SB)/8, $0x0706050403020100 DATA shuffle_masks<>+8(SB)/8, $0x0f0e0d0c0b0a0908 DATA shuffle_masks<>+16(SB)/8, $0x0706050403020100 DATA shuffle_masks<>+24(SB)/8, $0x0f0e0d0c0b0a0908 GLOBL shuffle_masks<>(SB), RODATA|NOPTR, $32 golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/keyset_arm64.go000066400000000000000000000003551452252572700260030ustar00rootroot00000000000000//go:build !purego // +build !purego package keyset // Lookup searches for a key in a set of keys, returning its index if // found. If the key cannot be found, the number of keys is returned. func Lookup(keyset []byte, key []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/keyset_arm64.s000066400000000000000000000111541452252572700256370ustar00rootroot00000000000000//go:build !purego // +build !purego #include "textflag.h" // func Lookup(keyset []byte, key []byte) int TEXT ·Lookup(SB), NOSPLIT, $0-56 MOVD keyset+0(FP), R0 MOVD keyset_len+8(FP), R1 MOVD key+24(FP), R2 MOVD key_len+32(FP), R3 MOVD key_cap+40(FP), R4 // None of the keys in the set are greater than 16 bytes, so if the input // key is we can jump straight to not found. CMP $16, R3 BHI notfound // We'll be moving the keyset pointer (R0) forward as we compare keys, so // make a copy of the starting point (R6). Also add the byte length (R1) to // obtain a pointer to the end of the keyset (R5). MOVD R0, R6 ADD R0, R1, R5 // Prepare a 64-bit mask of all ones. MOVD $-1, R7 // Prepare a vector of all zeroes. VMOV ZR, V1.B16 // Check that it's safe to load 16 bytes of input. If cap(input)<16, jump // to a check that determines whether a tail load is necessary (to avoid a // page fault). CMP $16, R4 BLO safeload load: // Load the input key (V0) and pad with zero bytes (V1). To blend the two // vectors, we load a mask for the particular key length and then use TBL // to select bytes from either V0 or V1. 
VLD1 (R2), [V0.B16] MOVD $blend_masks<>(SB), R10 ADD R3<<4, R10, R10 VLD1 (R10), [V2.B16] VTBL V2.B16, [V0.B16, V1.B16], V3.B16 loop: // Loop through each 16 byte key in the keyset. CMP R0, R5 BEQ notfound // Load and compare the next key. VLD1.P 16(R0), [V4.B16] VCMEQ V3.B16, V4.B16, V5.B16 VMOV V5.D[0], R8 VMOV V5.D[1], R9 AND R8, R9, R9 // If the masks match, we found the key. CMP R9, R7 BEQ found JMP loop found: // If the key was found, take the position in the keyset and convert it // to an index. The keyset pointer (R0) will be 1 key past the match, so // subtract the starting pointer (R6), divide by 16 to convert from byte // length to an index, and then subtract one. SUB R6, R0, R0 ADD R0>>4, ZR, R0 SUB $1, R0, R0 MOVD R0, ret+48(FP) RET notfound: // Return the number of keys in the keyset, which is the byte length (R1) // divided by 16. ADD R1>>4, ZR, R1 MOVD R1, ret+48(FP) RET safeload: // Check if the input crosses a page boundary. If not, jump back. AND $4095, R2, R12 CMP $4080, R12 BLS load // If it does cross a page boundary, we must assume that loading 16 bytes // will cause a fault. Instead, we load the 16 bytes up to and including the // key and then shuffle the key forward in the register. We can shuffle and // pad with zeroes at the same time to avoid having to also blend (as load // does). MOVD $16, R12 SUB R3, R12, R12 SUB R12, R2, R2 VLD1 (R2), [V0.B16] MOVD $shuffle_masks<>(SB), R10 ADD R12, R10, R10 VLD1 (R10), [V2.B16] VTBL V2.B16, [V0.B16, V1.B16], V3.B16 JMP loop DATA blend_masks<>+0(SB)/8, $0x1010101010101010 DATA blend_masks<>+8(SB)/8, $0x1010101010101010 DATA blend_masks<>+16(SB)/8, $0x1010101010101000 DATA blend_masks<>+24(SB)/8, $0x1010101010101010 DATA blend_masks<>+32(SB)/8, $0x1010101010100100 DATA blend_masks<>+40(SB)/8, $0x1010101010101010 DATA blend_masks<>+48(SB)/8, $0x1010101010020100 DATA blend_masks<>+56(SB)/8, $0x1010101010101010 DATA blend_masks<>+64(SB)/8, $0x1010101003020100 DATA blend_masks<>+72(SB)/8, $0x1010101010101010 DATA blend_masks<>+80(SB)/8, $0x1010100403020100 DATA blend_masks<>+88(SB)/8, $0x1010101010101010 DATA blend_masks<>+96(SB)/8, $0x1010050403020100 DATA blend_masks<>+104(SB)/8, $0x1010101010101010 DATA blend_masks<>+112(SB)/8, $0x1006050403020100 DATA blend_masks<>+120(SB)/8, $0x1010101010101010 DATA blend_masks<>+128(SB)/8, $0x0706050403020100 DATA blend_masks<>+136(SB)/8, $0x1010101010101010 DATA blend_masks<>+144(SB)/8, $0x0706050403020100 DATA blend_masks<>+152(SB)/8, $0x1010101010101008 DATA blend_masks<>+160(SB)/8, $0x0706050403020100 DATA blend_masks<>+168(SB)/8, $0x1010101010100908 DATA blend_masks<>+176(SB)/8, $0x0706050403020100 DATA blend_masks<>+184(SB)/8, $0x10101010100A0908 DATA blend_masks<>+192(SB)/8, $0x0706050403020100 DATA blend_masks<>+200(SB)/8, $0x101010100B0A0908 DATA blend_masks<>+208(SB)/8, $0x0706050403020100 DATA blend_masks<>+216(SB)/8, $0x1010100C0B0A0908 DATA blend_masks<>+224(SB)/8, $0x0706050403020100 DATA blend_masks<>+232(SB)/8, $0x10100D0C0B0A0908 DATA blend_masks<>+240(SB)/8, $0x0706050403020100 DATA blend_masks<>+248(SB)/8, $0x100E0D0C0B0A0908 DATA blend_masks<>+256(SB)/8, $0x0706050403020100 DATA blend_masks<>+264(SB)/8, $0x0F0E0D0C0B0A0908 GLOBL blend_masks<>(SB), RODATA|NOPTR, $272 DATA shuffle_masks<>+0(SB)/8, $0x0706050403020100 DATA shuffle_masks<>+8(SB)/8, $0x0F0E0D0C0B0A0908 DATA shuffle_masks<>+16(SB)/8, $0x1010101010101010 DATA shuffle_masks<>+24(SB)/8, $0x1010101010101010 GLOBL shuffle_masks<>(SB), RODATA|NOPTR, $32 
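// A usage sketch of the Lookup contract implemented above and by the
// portable fallback that follows (hedged example; the values assume the
// inputs shown and the needed imports): a hit returns the key's index, a
// miss returns the number of keys, i.e. len(keyset)/16.
//
//	set := keyset.New([][]byte{[]byte("a"), []byte("bb"), []byte("ccc")})
//	if set != nil {
//		fmt.Println(keyset.Lookup(set, []byte("bb")))   // 1
//		fmt.Println(keyset.Lookup(set, []byte("nope"))) // 3, the key count
//	}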
golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/keyset_default.go000066400000000000000000000005571452252572700265020ustar00rootroot00000000000000//go:build purego || !(amd64 || arm64) // +build purego !amd64,!arm64 package keyset func Lookup(keyset []byte, key []byte) int { if len(key) > 16 { return len(keyset) / 16 } var padded [16]byte copy(padded[:], key) for i := 0; i < len(keyset); i += 16 { if string(padded[:]) == string(keyset[i:i+16]) { return i / 16 } } return len(keyset) / 16 } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/keyset/keyset_test.go000066400000000000000000000103321452252572700260250ustar00rootroot00000000000000package keyset import ( "bytes" "fmt" "math/rand" "strconv" "testing" "github.com/segmentio/asm/internal/buffer" ) func TestKeySet(t *testing.T) { const max = 23 keys := make([][]byte, max) for i := range keys { keys[i] = []byte(strconv.Itoa(max - i)) } for i := 0; i < max; i++ { subset := keys[:i] keyset := New(subset) if keyset == nil { t.Skip("not implemented") } for j := range subset { if n := Lookup(keyset, subset[j]); n != j { t.Errorf("unexpected index for known key: %d, expected %d", n, j) } } if n := Lookup(keyset, []byte(fmt.Sprintf("key-%d", i+1))); n != len(subset) { t.Errorf("unexpected index for unknown key: %d", n) } } for i := 0; i < max; i++ { for j := 0; j <= 16; j++ { key := bytes.Repeat([]byte("x"), j) keyset := New(append(keys[:i:i], key)) if n := Lookup(keyset, key); n != i { t.Errorf("unexpected index for known key: %d", n) } if j > 0 { if n := Lookup(keyset, key[:j-1]); n != i+1 { t.Errorf("unexpected match: %d", n) } } if n := Lookup(keyset, append(key, 'x')); n != i+1 { t.Errorf("unexpected match: %d", n) } } } if New([][]byte{[]byte("foo\x00bar")}) != nil { t.Error("keyset was created when key contained null byte") } if New([][]byte{bytes.Repeat([]byte{'x'}, 17)}) != nil { t.Error("keyset was created when key was longer than 16 bytes") } } const hex = "0123456789abcdef" func TestPageBoundary(t *testing.T) { buf, err := buffer.New(16) if err != nil { t.Fatal(err) } defer buf.Release() head := buf.ProtectHead() tail := buf.ProtectTail() copy(head, hex) copy(tail, hex) for i := 0; i <= 16; i++ { key := head[:i] keyset := New([][]byte{[]byte("foo"), []byte("bar"), key}) if keyset == nil { t.Skip("not implemented") } if n := Lookup(keyset, key); n != 2 { t.Errorf("unexpected lookup result %d", n) } } for i := 0; i <= 16; i++ { key := tail[i:] keyset := New([][]byte{[]byte("foo"), []byte("bar"), key}) if n := Lookup(keyset, key); n != 2 { t.Errorf("unexpected lookup result for i=%d: %d", i, n) } } } func BenchmarkKeySet(b *testing.B) { keys := make([][]byte, 32) m := map[string]int{} for i := range keys { k := "key-" + strconv.Itoa(i) // k := strings.Repeat(strconv.Itoa(i), i) if len(k) > 16 { k = k[:16] } keys[i] = []byte(k) m[k] = i } prng := rand.New(rand.NewSource(0)) const permutations = 1000 // enough to throw off the branch predictor hopefully r := make([]int, len(keys)*permutations) for i := 0; i < permutations; i++ { x := r[i*len(keys):][:len(keys)] for j := range x { x[j] = j } prng.Shuffle(len(keys), func(a, b int) { x[a], x[b] = x[b], x[a] }) } keyset := New(keys) if keyset == nil { b.Skip("not implemented") } b.Run("map-lookup-first", func(b *testing.B) { first := keys[0] b.ResetTimer() for i := 0; i < b.N; i++ { _ = m[string(first)] } }) b.Run("keyset-lookup-first", func(b *testing.B) { first := keys[0] b.ResetTimer() for i := 0; i < b.N; i++ { Lookup(keyset, first) } }) b.Run("map-lookup-last", func(b 
*testing.B) { last := keys[len(keys)-1] b.ResetTimer() for i := 0; i < b.N; i++ { _ = m[string(last)] } }) b.Run("keyset-lookup-last", func(b *testing.B) { last := keys[len(keys)-1] b.ResetTimer() for i := 0; i < b.N; i++ { Lookup(keyset, last) } }) b.Run("map-ordered-iteration", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { for _, k := range keys { _ = m[string(k)] } } }) b.Run("keyset-ordered-iteration", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { for _, k := range keys { Lookup(keyset, k) } } }) b.Run("map-random-iteration", func(b *testing.B) { prng := rand.New(rand.NewSource(0)) b.ResetTimer() for i := 0; i < b.N; i++ { p := prng.Intn(permutations) permutation := r[p*len(keys):][:len(keys)] for _, i := range permutation { _ = m[string(keys[i])] } } }) b.Run("keyset-random-iteration", func(b *testing.B) { prng := rand.New(rand.NewSource(0)) b.ResetTimer() for i := 0; i < b.N; i++ { p := prng.Intn(permutations) permutation := r[p*len(keys):][:len(keys)] for _, i := range permutation { Lookup(keyset, keys[i]) } } }) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/000077500000000000000000000000001452252572700224065ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/blend.go000066400000000000000000000003501452252572700240170ustar00rootroot00000000000000package mem func blendGeneric(dst, src []byte) int { switch { case len(dst) < len(src): src = src[:len(dst)] case len(dst) > len(src): dst = dst[:len(src)] } for i := range dst { dst[i] |= src[i] } return len(dst) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/blend_amd64.go000066400000000000000000000004341452252572700250150ustar00rootroot00000000000000// Code generated by command: go run blend_asm.go -pkg mem -out ../mem/blend_amd64.s -stubs ../mem/blend_amd64.go. DO NOT EDIT. //go:build !purego package mem // Blend copies the one-bits of src to dst, returning the number of bytes written. func Blend(dst []byte, src []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/blend_amd64.s000066400000000000000000000066041452252572700246570ustar00rootroot00000000000000// Code generated by command: go run blend_asm.go -pkg mem -out ../mem/blend_amd64.s -stubs ../mem/blend_amd64.go. DO NOT EDIT. 
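// Blend, per blendGeneric above, ORs src into dst over the shorter of the
// two lengths and reports how many bytes it touched. One natural
// (hypothetical) use is merging two bitsets in place:
//
//	a := []byte{0b0011, 0b1100}
//	b := []byte{0b0101, 0b0101}
//	n := mem.Blend(a, b)
//	// n == 2, a == []byte{0b0111, 0b1101}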
//go:build !purego #include "textflag.h" // func Blend(dst []byte, src []byte) int // Requires: AVX, AVX2, CMOV, SSE2 TEXT ·Blend(SB), NOSPLIT, $0-56 MOVQ dst_base+0(FP), AX MOVQ src_base+24(FP), CX MOVQ dst_len+8(FP), DX MOVQ src_len+32(FP), BX CMPQ BX, DX CMOVQLT BX, DX MOVQ DX, ret+48(FP) tail: CMPQ DX, $0x00 JE done CMPQ DX, $0x01 JE handle1 CMPQ DX, $0x03 JBE handle2to3 CMPQ DX, $0x04 JE handle4 CMPQ DX, $0x08 JB handle5to7 JE handle8 CMPQ DX, $0x10 JBE handle9to16 CMPQ DX, $0x20 JBE handle17to32 CMPQ DX, $0x40 JBE handle33to64 BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC generic CMPQ DX, $0x00000080 JB avx2_tail JMP avx2 generic: MOVOU (CX), X0 MOVOU (AX), X1 MOVOU 16(CX), X2 MOVOU 16(AX), X3 MOVOU 32(CX), X4 MOVOU 32(AX), X5 MOVOU 48(CX), X6 MOVOU 48(AX), X7 POR X1, X0 POR X3, X2 POR X5, X4 POR X7, X6 MOVOU X0, (AX) MOVOU X2, 16(AX) MOVOU X4, 32(AX) MOVOU X6, 48(AX) ADDQ $0x40, CX ADDQ $0x40, AX SUBQ $0x40, DX CMPQ DX, $0x40 JBE tail JMP generic done: RET handle1: MOVB (CX), CL MOVB (AX), DL ORB DL, CL MOVB CL, (AX) RET handle2to3: MOVW (CX), BX MOVW (AX), SI MOVW -2(CX)(DX*1), CX MOVW -2(AX)(DX*1), DI ORW SI, BX ORW DI, CX MOVW BX, (AX) MOVW CX, -2(AX)(DX*1) RET handle4: MOVL (CX), CX MOVL (AX), DX ORL DX, CX MOVL CX, (AX) RET handle5to7: MOVL (CX), BX MOVL (AX), SI MOVL -4(CX)(DX*1), CX MOVL -4(AX)(DX*1), DI ORL SI, BX ORL DI, CX MOVL BX, (AX) MOVL CX, -4(AX)(DX*1) RET handle8: MOVQ (CX), CX MOVQ (AX), DX ORQ DX, CX MOVQ CX, (AX) RET handle9to16: MOVQ (CX), BX MOVQ (AX), SI MOVQ -8(CX)(DX*1), CX MOVQ -8(AX)(DX*1), DI ORQ SI, BX ORQ DI, CX MOVQ BX, (AX) MOVQ CX, -8(AX)(DX*1) RET handle17to32: MOVOU (CX), X0 MOVOU (AX), X1 MOVOU -16(CX)(DX*1), X2 MOVOU -16(AX)(DX*1), X3 POR X1, X0 POR X3, X2 MOVOU X0, (AX) MOVOU X2, -16(AX)(DX*1) RET handle33to64: MOVOU (CX), X0 MOVOU (AX), X1 MOVOU 16(CX), X2 MOVOU 16(AX), X3 MOVOU -32(CX)(DX*1), X4 MOVOU -32(AX)(DX*1), X5 MOVOU -16(CX)(DX*1), X6 MOVOU -16(AX)(DX*1), X7 POR X1, X0 POR X3, X2 POR X5, X4 POR X7, X6 MOVOU X0, (AX) MOVOU X2, 16(AX) MOVOU X4, -32(AX)(DX*1) MOVOU X6, -16(AX)(DX*1) RET // AVX optimized version for medium to large size inputs. avx2: VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VPOR (AX), Y0, Y0 VPOR 32(AX), Y1, Y1 VPOR 64(AX), Y2, Y2 VPOR 96(AX), Y3, Y3 VMOVDQU Y0, (AX) VMOVDQU Y1, 32(AX) VMOVDQU Y2, 64(AX) VMOVDQU Y3, 96(AX) ADDQ $0x00000080, CX ADDQ $0x00000080, AX SUBQ $0x00000080, DX JZ avx2_done CMPQ DX, $0x00000080 JAE avx2 avx2_tail: CMPQ DX, $0x40 JBE avx2_tail_1to64 VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU -64(CX)(DX*1), Y2 VMOVDQU -32(CX)(DX*1), Y3 VPOR (AX), Y0, Y0 VPOR 32(AX), Y1, Y1 VPOR -64(AX)(DX*1), Y2, Y2 VPOR -32(AX)(DX*1), Y3, Y3 VMOVDQU Y0, (AX) VMOVDQU Y1, 32(AX) VMOVDQU Y2, -64(AX)(DX*1) VMOVDQU Y3, -32(AX)(DX*1) JMP avx2_done avx2_tail_1to64: VMOVDQU -64(CX)(DX*1), Y0 VMOVDQU -32(CX)(DX*1), Y1 VPOR -64(AX)(DX*1), Y0, Y0 VPOR -32(AX)(DX*1), Y1, Y1 VMOVDQU Y0, -64(AX)(DX*1) VMOVDQU Y1, -32(AX)(DX*1) avx2_done: VZEROUPPER RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/blend_default.go000066400000000000000000000003461452252572700255300ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package mem // Blend performs an OR of src and dst into dst, returning the number of bytes // written to dst. 
func Blend(dst, src []byte) int { return blendGeneric(dst, src) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/blend_test.go000066400000000000000000000002421452252572700250560ustar00rootroot00000000000000package mem import "testing" func TestBlend(t *testing.T) { testCopy(t, Blend, blendGeneric) } func BenchmarkBlend(b *testing.B) { benchmarkCopy(b, Blend) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/contains.go000066400000000000000000000002111452252572700245450ustar00rootroot00000000000000package mem import "bytes" func containsGeneric(haystack []byte, needle byte) bool { return bytes.IndexByte(haystack, needle) != -1 } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/contains_amd64.go000066400000000000000000000003401452252572700255430ustar00rootroot00000000000000// Code generated by command: go run contains_asm.go -pkg mem -out ../mem/contains_amd64.s -stubs ../mem/contains_amd64.go. DO NOT EDIT. //go:build !purego package mem func ContainsByte(haystack []byte, needle byte) bool golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/contains_amd64.s000066400000000000000000000110251452252572700254020ustar00rootroot00000000000000// Code generated by command: go run contains_asm.go -pkg mem -out ../mem/contains_amd64.s -stubs ../mem/contains_amd64.go. DO NOT EDIT. //go:build !purego #include "textflag.h" // func ContainsByte(haystack []byte, needle byte) bool // Requires: AVX, AVX2, SSE2, SSE4.1 TEXT ·ContainsByte(SB), NOSPLIT, $0-33 MOVQ haystack_base+0(FP), AX MOVQ haystack_len+8(FP), CX XORQ DX, DX MOVB needle+24(FP), DL MOVQ DX, BX SHLQ $0x08, BX ORQ BX, DX MOVQ DX, BX SHLQ $0x10, BX ORQ BX, DX MOVQ DX, BX SHLQ $0x20, BX ORQ BX, DX MOVQ $0x0101010101010101, BX MOVQ $0x8080808080808080, SI MOVB $0x00, ret+32(FP) JMP start found: MOVB $0x01, ret+32(FP) JMP done avx2_found: MOVB $0x01, ret+32(FP) JMP avx2_done start: CMPQ CX, $0x10 JBE tail PXOR X1, X1 PINSRQ $0x00, DX, X0 PINSRQ $0x01, DX, X0 tail: CMPQ CX, $0x00 JE done CMPQ CX, $0x01 JE handle1 CMPQ CX, $0x03 JBE handle2to3 CMPQ CX, $0x04 JE handle4 CMPQ CX, $0x08 JB handle5to7 JE handle8 CMPQ CX, $0x10 JBE handle9to16 CMPQ CX, $0x20 JBE handle17to32 CMPQ CX, $0x40 JBE handle33to64 BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC generic VZEROUPPER VPBROADCASTQ X0, Y0 CMPQ CX, $0x00000100 JB avx2_tail JMP avx2 generic: MOVOU (AX), X2 MOVOU 16(AX), X3 MOVOU 32(AX), X4 MOVOU 48(AX), X5 PCMPEQB X0, X2 PCMPEQB X0, X3 PCMPEQB X0, X4 PCMPEQB X0, X5 POR X2, X3 POR X4, X5 POR X3, X5 PTEST X5, X1 JCC found ADDQ $0x40, AX SUBQ $0x40, CX CMPQ CX, $0x40 JBE tail JMP generic done: RET handle1: MOVB (AX), AL CMPB AL, DL JE found RET handle2to3: MOVW (AX), DI MOVW -2(AX)(CX*1), AX XORW DX, DI MOVW DI, CX XORW DX, AX MOVW AX, DX SUBW BX, CX NOTW DI ANDW DI, CX SUBW BX, DX NOTW AX ANDW AX, DX ORW CX, DX ANDW SI, DX JNZ found RET handle4: MOVL (AX), AX XORL DX, AX MOVL AX, CX SUBL BX, CX NOTL AX ANDL AX, CX ANDL SI, CX JNZ found RET handle5to7: MOVL (AX), DI MOVL -4(AX)(CX*1), AX XORL DX, DI MOVL DI, CX XORL DX, AX MOVL AX, DX SUBL BX, CX NOTL DI ANDL DI, CX SUBL BX, DX NOTL AX ANDL AX, DX ORL CX, DX ANDL SI, DX JNZ found RET handle8: MOVQ (AX), AX XORQ DX, AX MOVQ AX, CX SUBQ BX, CX NOTQ AX ANDQ AX, CX ANDQ SI, CX JNZ found RET handle9to16: MOVQ (AX), DI MOVQ -8(AX)(CX*1), AX XORQ DX, DI MOVQ DI, CX XORQ DX, AX MOVQ AX, DX SUBQ BX, CX NOTQ DI ANDQ DI, CX SUBQ BX, DX NOTQ AX ANDQ AX, DX ORQ CX, DX ANDQ SI, DX JNZ found RET handle17to32: MOVOU (AX), X2 MOVOU -16(AX)(CX*1), X3 PCMPEQB X0, X2 PCMPEQB X0, X3 POR 
X2, X3 PTEST X3, X1 JCC found RET handle33to64: MOVOU (AX), X2 MOVOU 16(AX), X3 MOVOU -32(AX)(CX*1), X4 MOVOU -16(AX)(CX*1), X5 PCMPEQB X0, X2 PCMPEQB X0, X3 PCMPEQB X0, X4 PCMPEQB X0, X5 POR X2, X3 POR X4, X5 POR X3, X5 PTEST X5, X1 JCC found RET // AVX optimized version for medium to large size inputs. avx2: VPCMPEQB (AX), Y0, Y2 VPCMPEQB 32(AX), Y0, Y3 VPCMPEQB 64(AX), Y0, Y4 VPCMPEQB 96(AX), Y0, Y5 VPCMPEQB 128(AX), Y0, Y6 VPCMPEQB 160(AX), Y0, Y7 VPCMPEQB 192(AX), Y0, Y8 VPCMPEQB 224(AX), Y0, Y9 VPOR Y2, Y3, Y3 VPOR Y4, Y5, Y5 VPOR Y6, Y7, Y7 VPOR Y8, Y9, Y9 VPOR Y3, Y5, Y5 VPOR Y7, Y9, Y9 VPOR Y5, Y9, Y9 VPTEST Y9, Y1 JCC avx2_found ADDQ $0x00000100, AX SUBQ $0x00000100, CX JZ avx2_done CMPQ CX, $0x00000100 JAE avx2 avx2_tail: CMPQ CX, $0x40 JBE avx2_tail_1to64 CMPQ CX, $0x80 JBE avx2_tail_65to128 VPCMPEQB (AX), Y0, Y2 VPCMPEQB 32(AX), Y0, Y3 VPCMPEQB 64(AX), Y0, Y4 VPCMPEQB 96(AX), Y0, Y5 VPCMPEQB -128(AX)(CX*1), Y0, Y6 VPCMPEQB -96(AX)(CX*1), Y0, Y7 VPCMPEQB -64(AX)(CX*1), Y0, Y8 VPCMPEQB -32(AX)(CX*1), Y0, Y0 VPOR Y2, Y3, Y3 VPOR Y4, Y5, Y5 VPOR Y6, Y7, Y7 VPOR Y8, Y0, Y0 VPOR Y3, Y5, Y5 VPOR Y7, Y0, Y0 VPOR Y5, Y0, Y0 VPTEST Y0, Y1 JCC avx2_found JMP avx2_done avx2_tail_65to128: VPCMPEQB (AX), Y0, Y2 VPCMPEQB 32(AX), Y0, Y3 VPCMPEQB -64(AX)(CX*1), Y0, Y4 VPCMPEQB -32(AX)(CX*1), Y0, Y0 VPOR Y2, Y3, Y3 VPOR Y4, Y0, Y0 VPOR Y3, Y0, Y0 VPTEST Y0, Y1 JCC avx2_found JMP avx2_done avx2_tail_1to64: VPCMPEQB -64(AX)(CX*1), Y0, Y2 VPCMPEQB -32(AX)(CX*1), Y0, Y0 VPOR Y2, Y0, Y0 VPTEST Y0, Y1 JCC avx2_found avx2_done: VZEROUPPER RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/contains_default.go000066400000000000000000000002451452252572700262600ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package mem func ContainsByte(haystack []byte, needle byte) bool { return containsGeneric(haystack, needle) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/contains_test.go000066400000000000000000000044541452252572700256210ustar00rootroot00000000000000package mem import ( "bytes" "testing" ) func TestContainsByte(t *testing.T) { for _, test := range []struct { name string haystack []byte needle byte expect bool }{ { name: "nil", haystack: nil, needle: 'x', expect: false, }, { name: "empty", haystack: []byte{}, needle: 'x', expect: false, }, { name: "one equal byte", haystack: []byte{'x'}, needle: 'x', expect: true, }, { name: "one unequal byte", haystack: []byte{'a'}, needle: 'x', expect: false, }, { name: "run without byte", haystack: []byte("aaaaaaaaaaaaaaaaaa"), needle: 'x', expect: false, }, { name: "run with byte prefix", haystack: []byte("xaaaaaaaaaaaaaaaaaa"), needle: 'x', expect: true, }, { name: "run with byte suffix", haystack: []byte("aaaaaaaaaaaaaaaaaax"), needle: 'x', expect: true, }, } { t.Run(test.name, func(t *testing.T) { if actual := ContainsByte(test.haystack, test.needle); actual != test.expect { t.Fatalf("ContainsByte(%v, %v) => %v", test.haystack, test.needle, actual) } }) } // Test specific lengths up to 1KB. 
var src []byte for i := 0; i < 1000; i++ { if ContainsByte(src, 'x') { t.Fatalf("ContainsByte(%v, 'x') => true", src) } src = append(src, 'x') if !ContainsByte(src, 'x') { t.Fatalf("ContainsByte(%v, 'x') => false", src) } src[i] = '0' } } func BenchmarkContainsByte(b *testing.B) { benchmarkContainsByte(b, ContainsByte) } func benchmarkContainsByte(b *testing.B, contains func([]byte, byte) bool) { large := bytes.Repeat([]byte{'a'}, 8*1024) b.Run("empty", benchmarkContainsByteCase(contains, nil, 'x')) b.Run("small-not-found", benchmarkContainsByteCase(contains, []byte("abcdef"), 'x')) b.Run("small-found-at-end", benchmarkContainsByteCase(contains, []byte("abcdefx"), 'x')) b.Run("large-not-found", benchmarkContainsByteCase(contains, large, 'x')) b.Run("large-found-at-end", benchmarkContainsByteCase(contains, append(large, 'x'), 'x')) } func benchmarkContainsByteCase(contains func([]byte, byte) bool, haystack []byte, needle byte) func(*testing.B) { return func(b *testing.B) { b.SetBytes(int64(len(haystack))) b.ResetTimer() for i := 0; i < b.N; i++ { contains(haystack, needle) } } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/copy.go000066400000000000000000000001161452252572700237050ustar00rootroot00000000000000package mem func copyGeneric(dst, src []byte) int { return copy(dst, src) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/copy_amd64.go000066400000000000000000000004071452252572700247030ustar00rootroot00000000000000// Code generated by command: go run copy_asm.go -pkg mem -out ../mem/copy_amd64.s -stubs ../mem/copy_amd64.go. DO NOT EDIT. //go:build !purego package mem // Copy copies src to dst, returning the number of bytes written. func Copy(dst []byte, src []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/copy_amd64.s000066400000000000000000000050161452252572700245410ustar00rootroot00000000000000// Code generated by command: go run copy_asm.go -pkg mem -out ../mem/copy_amd64.s -stubs ../mem/copy_amd64.go. DO NOT EDIT. 
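// Copy mirrors the built-in copy, as copyGeneric above shows: it writes
// min(len(dst), len(src)) bytes and returns that count. A quick
// (hypothetical) check of the contract:
//
//	dst := make([]byte, 4)
//	n := mem.Copy(dst, []byte("hello"))
//	// n == 4, dst == []byte("hell")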
//go:build !purego #include "textflag.h" // func Copy(dst []byte, src []byte) int // Requires: AVX, CMOV, SSE2 TEXT ·Copy(SB), NOSPLIT, $0-56 MOVQ dst_base+0(FP), AX MOVQ src_base+24(FP), CX MOVQ dst_len+8(FP), DX MOVQ src_len+32(FP), BX CMPQ BX, DX CMOVQLT BX, DX MOVQ DX, ret+48(FP) tail: CMPQ DX, $0x00 JE done CMPQ DX, $0x01 JE handle1 CMPQ DX, $0x03 JBE handle2to3 CMPQ DX, $0x04 JE handle4 CMPQ DX, $0x08 JB handle5to7 JE handle8 CMPQ DX, $0x10 JBE handle9to16 CMPQ DX, $0x20 JBE handle17to32 CMPQ DX, $0x40 JBE handle33to64 BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC generic CMPQ DX, $0x00000080 JB avx2_tail JMP avx2 generic: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU 32(CX), X2 MOVOU 48(CX), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) ADDQ $0x40, CX ADDQ $0x40, AX SUBQ $0x40, DX CMPQ DX, $0x40 JBE tail JMP generic done: RET handle1: MOVB (CX), CL MOVB CL, (AX) RET handle2to3: MOVW (CX), BX MOVW -2(CX)(DX*1), CX MOVW BX, (AX) MOVW CX, -2(AX)(DX*1) RET handle4: MOVL (CX), CX MOVL CX, (AX) RET handle5to7: MOVL (CX), BX MOVL -4(CX)(DX*1), CX MOVL BX, (AX) MOVL CX, -4(AX)(DX*1) RET handle8: MOVQ (CX), CX MOVQ CX, (AX) RET handle9to16: MOVQ (CX), BX MOVQ -8(CX)(DX*1), CX MOVQ BX, (AX) MOVQ CX, -8(AX)(DX*1) RET handle17to32: MOVOU (CX), X0 MOVOU -16(CX)(DX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DX*1) RET handle33to64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(DX*1), X2 MOVOU -16(CX)(DX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DX*1) MOVOU X3, -16(AX)(DX*1) RET // AVX optimized version for medium to large size inputs. avx2: VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VMOVDQU Y0, (AX) VMOVDQU Y1, 32(AX) VMOVDQU Y2, 64(AX) VMOVDQU Y3, 96(AX) ADDQ $0x00000080, CX ADDQ $0x00000080, AX SUBQ $0x00000080, DX JZ avx2_done CMPQ DX, $0x00000080 JAE avx2 avx2_tail: CMPQ DX, $0x40 JBE avx2_tail_1to64 VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU -64(CX)(DX*1), Y2 VMOVDQU -32(CX)(DX*1), Y3 VMOVDQU Y0, (AX) VMOVDQU Y1, 32(AX) VMOVDQU Y2, -64(AX)(DX*1) VMOVDQU Y3, -32(AX)(DX*1) JMP avx2_done avx2_tail_1to64: VMOVDQU -64(CX)(DX*1), Y0 VMOVDQU -32(CX)(DX*1), Y1 VMOVDQU Y0, -64(AX)(DX*1) VMOVDQU Y1, -32(AX)(DX*1) avx2_done: VZEROUPPER RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/copy_default.go000066400000000000000000000002031452252572700254060ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package mem func Copy(dst, src []byte) int { return copyGeneric(dst, src) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/copy_test.go000066400000000000000000000002351452252572700247460ustar00rootroot00000000000000package mem import "testing" func TestCopy(t *testing.T) { testCopy(t, Copy, copyGeneric) } func BenchmarkCopy(b *testing.B) { benchmarkCopy(b, Copy) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/count_pair.go000066400000000000000000000017041452252572700251020ustar00rootroot00000000000000package mem import ( "bytes" ) // CountPair returns the number of pairs of equal consecutive elements of // size n in b. // // CountPair panics if the length of b is not a multiple of n. func CountPair(b []byte, n int) int { if len(b)%n != 0 { panic("input length is not a multiple of the item size") } // Delegate to countPair to keep the function cost low and allow the size // check to be inlined and the modulo optimized away for power of two sizes // known at compile time. 
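// For example (hypothetical usage, assuming 8-byte items): the size check
// above can inline in the caller and, for a power-of-two size, reduce to a
// mask test (len(b)&7 == 0), while the counting loop stays out of line:
//
//	pairs := mem.CountPair(buf, 8) // adjacent equal 8-byte items in buf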
return countPair(b, n) } func countPair(b []byte, n int) int { switch n { case 1: return countPair1(b) case 2: return countPair2(b) case 4: return countPair4(b) case 8: return countPair8(b) case 16: return countPair16(b) case 32: return countPair32(b) default: return countPairGeneric(b, n) } } func countPairGeneric(b []byte, n int) int { c := 0 for i := n; i < len(b); i += n { if bytes.Equal(b[i-n:i], b[i:i+n]) { c++ } } return c } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/count_pair_amd64.go000066400000000000000000000005541452252572700260770ustar00rootroot00000000000000// Code generated by command: go run count_pair_asm.go -pkg mem -out ../mem/count_pair_amd64.s -stubs ../mem/count_pair_amd64.go. DO NOT EDIT. //go:build !purego package mem func countPair1(b []byte) int func countPair2(b []byte) int func countPair4(b []byte) int func countPair8(b []byte) int func countPair16(b []byte) int func countPair32(b []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/count_pair_amd64.s000066400000000000000000000472721452252572700257440ustar00rootroot00000000000000// Code generated by command: go run count_pair_asm.go -pkg mem -out ../mem/count_pair_amd64.s -stubs ../mem/count_pair_amd64.go. DO NOT EDIT. //go:build !purego #include "textflag.h" // func countPair1(b []byte) int // Requires: AVX, AVX2, CMOV, POPCNT TEXT ·countPair1(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX XORQ DX, DX SUBQ $0x01, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JLE done generic: MOVQ DX, BX INCQ BX MOVB (AX), SI CMPB SI, 1(AX) CMOVQEQ BX, DX ADDQ $0x01, AX SUBQ $0x01, CX CMPQ CX, $0x00 JG generic done: MOVQ DX, ret+24(FP) RET avx2: CMPQ CX, $0x00000101 JL avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 1(AX), Y1 VMOVDQU 33(AX), Y3 VMOVDQU 65(AX), Y5 VMOVDQU 97(AX), Y7 VMOVDQU 129(AX), Y9 VMOVDQU 161(AX), Y11 VMOVDQU 193(AX), Y13 VMOVDQU 225(AX), Y15 VPCMPEQB Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQB Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQB Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQB Y6, Y7, Y7 VPMOVMSKB Y7, R8 VPCMPEQB Y8, Y9, Y9 VPMOVMSKB Y9, R9 VPCMPEQB Y10, Y11, Y11 VPMOVMSKB Y11, R10 VPCMPEQB Y12, Y13, Y13 VPMOVMSKB Y13, R11 VPCMPEQB Y14, Y15, Y15 VPMOVMSKB Y15, R12 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 POPCNTQ R9, R9 POPCNTQ R10, R10 POPCNTQ R11, R11 POPCNTQ R12, R12 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ R10, R9 ADDQ R12, R11 ADDQ R11, R9 ADDQ R9, BX ADDQ BX, DX ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000101 JGE avx2_loop256 avx2_tail128: CMPQ CX, $0x81 JL avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 1(AX), Y1 VMOVDQU 33(AX), Y3 VMOVDQU 65(AX), Y5 VMOVDQU 97(AX), Y7 VPCMPEQB Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQB Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQB Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQB Y6, Y7, Y7 VPMOVMSKB Y7, R8 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ BX, DX ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x41 JL avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 1(AX), Y1 VMOVDQU 33(AX), Y3 VPCMPEQB Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQB Y2, Y3, Y3 VPMOVMSKB Y3, SI POPCNTQ BX, BX POPCNTQ SI, SI ADDQ SI, BX ADDQ BX, DX ADDQ 
$0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x21 JL avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 1(AX), Y1 VPCMPEQB Y0, Y1, Y1 VPMOVMSKB Y1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x11 JL avx2_tail VMOVDQU (AX), X0 VMOVDQU 1(AX), X1 VPCMPEQB X0, X1, X1 VPMOVMSKB X1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER JMP tail // func countPair2(b []byte) int // Requires: AVX, AVX2, CMOV, POPCNT TEXT ·countPair2(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX XORQ DX, DX SUBQ $0x02, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JLE done generic: MOVQ DX, BX INCQ BX MOVW (AX), SI CMPW SI, 2(AX) CMOVQEQ BX, DX ADDQ $0x02, AX SUBQ $0x02, CX CMPQ CX, $0x00 JG generic done: MOVQ DX, ret+24(FP) RET avx2: CMPQ CX, $0x00000102 JL avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 2(AX), Y1 VMOVDQU 34(AX), Y3 VMOVDQU 66(AX), Y5 VMOVDQU 98(AX), Y7 VMOVDQU 130(AX), Y9 VMOVDQU 162(AX), Y11 VMOVDQU 194(AX), Y13 VMOVDQU 226(AX), Y15 VPCMPEQW Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQW Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQW Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQW Y6, Y7, Y7 VPMOVMSKB Y7, R8 VPCMPEQW Y8, Y9, Y9 VPMOVMSKB Y9, R9 VPCMPEQW Y10, Y11, Y11 VPMOVMSKB Y11, R10 VPCMPEQW Y12, Y13, Y13 VPMOVMSKB Y13, R11 VPCMPEQW Y14, Y15, Y15 VPMOVMSKB Y15, R12 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 POPCNTQ R9, R9 POPCNTQ R10, R10 POPCNTQ R11, R11 POPCNTQ R12, R12 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ R10, R9 ADDQ R12, R11 ADDQ R11, R9 ADDQ R9, BX ADDQ BX, DX ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000102 JGE avx2_loop256 avx2_tail128: CMPQ CX, $0x82 JL avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 2(AX), Y1 VMOVDQU 34(AX), Y3 VMOVDQU 66(AX), Y5 VMOVDQU 98(AX), Y7 VPCMPEQW Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQW Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQW Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQW Y6, Y7, Y7 VPMOVMSKB Y7, R8 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ BX, DX ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x42 JL avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 2(AX), Y1 VMOVDQU 34(AX), Y3 VPCMPEQW Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQW Y2, Y3, Y3 VPMOVMSKB Y3, SI POPCNTQ BX, BX POPCNTQ SI, SI ADDQ SI, BX ADDQ BX, DX ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x22 JL avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 2(AX), Y1 VPCMPEQW Y0, Y1, Y1 VPMOVMSKB Y1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x12 JL avx2_tail VMOVDQU (AX), X0 VMOVDQU 2(AX), X1 VPCMPEQW X0, X1, X1 VPMOVMSKB X1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER SHRQ $0x01, DX JMP tail // func countPair4(b []byte) int // Requires: AVX, AVX2, CMOV, POPCNT TEXT ·countPair4(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX XORQ DX, DX SUBQ $0x04, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JLE done generic: MOVQ DX, BX INCQ BX MOVL (AX), SI CMPL SI, 4(AX) CMOVQEQ BX, DX ADDQ $0x04, AX SUBQ $0x04, CX CMPQ CX, $0x00 JG generic done: MOVQ DX, ret+24(FP) RET avx2: CMPQ CX, 
$0x00000104 JL avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 4(AX), Y1 VMOVDQU 36(AX), Y3 VMOVDQU 68(AX), Y5 VMOVDQU 100(AX), Y7 VMOVDQU 132(AX), Y9 VMOVDQU 164(AX), Y11 VMOVDQU 196(AX), Y13 VMOVDQU 228(AX), Y15 VPCMPEQD Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQD Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQD Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQD Y6, Y7, Y7 VPMOVMSKB Y7, R8 VPCMPEQD Y8, Y9, Y9 VPMOVMSKB Y9, R9 VPCMPEQD Y10, Y11, Y11 VPMOVMSKB Y11, R10 VPCMPEQD Y12, Y13, Y13 VPMOVMSKB Y13, R11 VPCMPEQD Y14, Y15, Y15 VPMOVMSKB Y15, R12 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 POPCNTQ R9, R9 POPCNTQ R10, R10 POPCNTQ R11, R11 POPCNTQ R12, R12 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ R10, R9 ADDQ R12, R11 ADDQ R11, R9 ADDQ R9, BX ADDQ BX, DX ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000104 JGE avx2_loop256 avx2_tail128: CMPQ CX, $0x84 JL avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 4(AX), Y1 VMOVDQU 36(AX), Y3 VMOVDQU 68(AX), Y5 VMOVDQU 100(AX), Y7 VPCMPEQD Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQD Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQD Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQD Y6, Y7, Y7 VPMOVMSKB Y7, R8 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ BX, DX ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x44 JL avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 4(AX), Y1 VMOVDQU 36(AX), Y3 VPCMPEQD Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQD Y2, Y3, Y3 VPMOVMSKB Y3, SI POPCNTQ BX, BX POPCNTQ SI, SI ADDQ SI, BX ADDQ BX, DX ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x24 JL avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 4(AX), Y1 VPCMPEQD Y0, Y1, Y1 VPMOVMSKB Y1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x14 JL avx2_tail VMOVDQU (AX), X0 VMOVDQU 4(AX), X1 VPCMPEQD X0, X1, X1 VPMOVMSKB X1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER SHRQ $0x02, DX JMP tail // func countPair8(b []byte) int // Requires: AVX, AVX2, CMOV, POPCNT TEXT ·countPair8(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX XORQ DX, DX SUBQ $0x08, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JLE done generic: MOVQ DX, BX INCQ BX MOVQ (AX), SI CMPQ SI, 8(AX) CMOVQEQ BX, DX ADDQ $0x08, AX SUBQ $0x08, CX CMPQ CX, $0x00 JG generic done: MOVQ DX, ret+24(FP) RET avx2: CMPQ CX, $0x00000108 JL avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 8(AX), Y1 VMOVDQU 40(AX), Y3 VMOVDQU 72(AX), Y5 VMOVDQU 104(AX), Y7 VMOVDQU 136(AX), Y9 VMOVDQU 168(AX), Y11 VMOVDQU 200(AX), Y13 VMOVDQU 232(AX), Y15 VPCMPEQQ Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQQ Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQQ Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQQ Y6, Y7, Y7 VPMOVMSKB Y7, R8 VPCMPEQQ Y8, Y9, Y9 VPMOVMSKB Y9, R9 VPCMPEQQ Y10, Y11, Y11 VPMOVMSKB Y11, R10 VPCMPEQQ Y12, Y13, Y13 VPMOVMSKB Y13, R11 VPCMPEQQ Y14, Y15, Y15 VPMOVMSKB Y15, R12 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 POPCNTQ R9, R9 POPCNTQ R10, 
R10 POPCNTQ R11, R11 POPCNTQ R12, R12 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ R10, R9 ADDQ R12, R11 ADDQ R11, R9 ADDQ R9, BX ADDQ BX, DX ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000108 JGE avx2_loop256 avx2_tail128: CMPQ CX, $0x88 JL avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 8(AX), Y1 VMOVDQU 40(AX), Y3 VMOVDQU 72(AX), Y5 VMOVDQU 104(AX), Y7 VPCMPEQQ Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQQ Y2, Y3, Y3 VPMOVMSKB Y3, SI VPCMPEQQ Y4, Y5, Y5 VPMOVMSKB Y5, DI VPCMPEQQ Y6, Y7, Y7 VPMOVMSKB Y7, R8 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ BX, DX ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x48 JL avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 8(AX), Y1 VMOVDQU 40(AX), Y3 VPCMPEQQ Y0, Y1, Y1 VPMOVMSKB Y1, BX VPCMPEQQ Y2, Y3, Y3 VPMOVMSKB Y3, SI POPCNTQ BX, BX POPCNTQ SI, SI ADDQ SI, BX ADDQ BX, DX ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x28 JL avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 8(AX), Y1 VPCMPEQQ Y0, Y1, Y1 VPMOVMSKB Y1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x18 JL avx2_tail VMOVDQU (AX), X0 VMOVDQU 8(AX), X1 VPCMPEQQ X0, X1, X1 VPMOVMSKB X1, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER SHRQ $0x03, DX JMP tail // func countPair16(b []byte) int // Requires: AVX, AVX2, CMOV, POPCNT, SSE2, SSE4.1 TEXT ·countPair16(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX XORQ DX, DX SUBQ $0x10, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JLE done generic: MOVQ DX, BX INCQ BX MOVOU (AX), X0 MOVOU 16(AX), X1 PCMPEQQ X0, X1 PMOVMSKB X1, SI CMPL SI, $0x0000ffff CMOVQEQ BX, DX ADDQ $0x10, AX SUBQ $0x10, CX CMPQ CX, $0x00 JG generic done: MOVQ DX, ret+24(FP) RET avx2: CMPQ CX, $0x00000110 JL avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VPERM2I128 $0x21, Y2, Y0, Y1 VPERM2I128 $0x21, Y4, Y2, Y3 VPERM2I128 $0x21, Y6, Y4, Y5 VPERM2I128 $0x21, Y8, Y6, Y7 VPERM2I128 $0x21, Y10, Y8, Y9 VPERM2I128 $0x21, Y12, Y10, Y11 VPERM2I128 $0x21, Y14, Y12, Y13 VMOVDQU 240(AX), Y15 VPCMPEQQ Y0, Y1, Y1 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX VPCMPEQQ Y2, Y3, Y3 VPERMQ $0xb1, Y3, Y2 VPAND Y3, Y2, Y2 VPMOVMSKB Y2, SI VPCMPEQQ Y4, Y5, Y5 VPERMQ $0xb1, Y5, Y4 VPAND Y5, Y4, Y4 VPMOVMSKB Y4, DI VPCMPEQQ Y6, Y7, Y7 VPERMQ $0xb1, Y7, Y6 VPAND Y7, Y6, Y6 VPMOVMSKB Y6, R8 VPCMPEQQ Y8, Y9, Y9 VPERMQ $0xb1, Y9, Y8 VPAND Y9, Y8, Y8 VPMOVMSKB Y8, R9 VPCMPEQQ Y10, Y11, Y11 VPERMQ $0xb1, Y11, Y10 VPAND Y11, Y10, Y10 VPMOVMSKB Y10, R10 VPCMPEQQ Y12, Y13, Y13 VPERMQ $0xb1, Y13, Y12 VPAND Y13, Y12, Y12 VPMOVMSKB Y12, R11 VPCMPEQQ Y14, Y15, Y15 VPERMQ $0xb1, Y15, Y14 VPAND Y15, Y14, Y14 VPMOVMSKB Y14, R12 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 POPCNTQ R9, R9 POPCNTQ R10, R10 POPCNTQ R11, R11 POPCNTQ R12, R12 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ R10, R9 ADDQ R12, R11 ADDQ R11, R9 ADDQ R9, BX ADDQ BX, DX ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000110 JGE avx2_loop256 avx2_tail128: CMPQ CX, $0x90 JL avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VPERM2I128 $0x21, Y2, Y0, Y1 VPERM2I128 $0x21, Y4, Y2, Y3 
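	// Each VPERM2I128 $0x21 in this block splices the high 16 bytes of one
	// 32-byte load onto the low 16 bytes of the next, materializing the
	// input shifted by one 16-byte item without extra unaligned loads, so
	// VPCMPEQQ can compare adjacent items lane by lane.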
VPERM2I128 $0x21, Y6, Y4, Y5 VMOVDQU 112(AX), Y7 VPCMPEQQ Y0, Y1, Y1 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX VPCMPEQQ Y2, Y3, Y3 VPERMQ $0xb1, Y3, Y2 VPAND Y3, Y2, Y2 VPMOVMSKB Y2, SI VPCMPEQQ Y4, Y5, Y5 VPERMQ $0xb1, Y5, Y4 VPAND Y5, Y4, Y4 VPMOVMSKB Y4, DI VPCMPEQQ Y6, Y7, Y7 VPERMQ $0xb1, Y7, Y6 VPAND Y7, Y6, Y6 VPMOVMSKB Y6, R8 POPCNTQ BX, BX POPCNTQ SI, SI POPCNTQ DI, DI POPCNTQ R8, R8 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ BX, DX ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x50 JL avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VPERM2I128 $0x21, Y2, Y0, Y1 VMOVDQU 48(AX), Y3 VPCMPEQQ Y0, Y1, Y1 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX VPCMPEQQ Y2, Y3, Y3 VPERMQ $0xb1, Y3, Y2 VPAND Y3, Y2, Y2 VPMOVMSKB Y2, SI POPCNTQ BX, BX POPCNTQ SI, SI ADDQ SI, BX ADDQ BX, DX ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x30 JL avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 16(AX), Y1 VPCMPEQQ Y0, Y1, Y1 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX POPCNTQ BX, BX ADDQ BX, DX ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: VZEROUPPER SHRQ $0x04, DX JMP tail // func countPair32(b []byte) int // Requires: AVX, AVX2, CMOV, POPCNT, SSE2, SSE4.1 TEXT ·countPair32(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX XORQ DX, DX SUBQ $0x20, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JLE done generic: MOVQ DX, BX INCQ BX MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU 32(AX), X2 MOVOU 48(AX), X3 PCMPEQQ X0, X2 PCMPEQQ X1, X3 PMOVMSKB X2, SI PMOVMSKB X3, DI ANDL DI, SI CMPL SI, $0x0000ffff CMOVQEQ BX, DX ADDQ $0x20, AX SUBQ $0x20, CX CMPQ CX, $0x00 JG generic done: MOVQ DX, ret+24(FP) RET avx2: CMPQ CX, $0x00000120 JL avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y5 VMOVDQU 128(AX), Y7 VMOVDQU 160(AX), Y8 VMOVDQU 192(AX), Y9 VMOVDQU 224(AX), Y10 VMOVDQU 256(AX), Y11 VPCMPEQQ Y0, Y2, Y1 VPMOVMSKB Y1, BX VPCMPEQQ Y2, Y4, Y3 VPMOVMSKB Y3, SI VPCMPEQQ Y4, Y5, Y0 VPMOVMSKB Y0, DI VPCMPEQQ Y5, Y7, Y6 VPMOVMSKB Y6, R8 VPCMPEQQ Y7, Y8, Y0 VPMOVMSKB Y0, R9 VPCMPEQQ Y8, Y9, Y0 VPMOVMSKB Y0, R10 VPCMPEQQ Y9, Y10, Y0 VPMOVMSKB Y0, R11 VPCMPEQQ Y10, Y11, Y11 VPMOVMSKB Y11, R12 POPCNTQ BX, BX SHRQ $0x05, BX POPCNTQ SI, SI SHRQ $0x05, SI POPCNTQ DI, DI SHRQ $0x05, DI POPCNTQ R8, R8 SHRQ $0x05, R8 POPCNTQ R9, R9 SHRQ $0x05, R9 POPCNTQ R10, R10 SHRQ $0x05, R10 POPCNTQ R11, R11 SHRQ $0x05, R11 POPCNTQ R12, R12 SHRQ $0x05, R12 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ R10, R9 ADDQ R12, R11 ADDQ R11, R9 ADDQ R9, BX ADDQ BX, DX ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000120 JGE avx2_loop256 avx2_tail128: CMPQ CX, $0xa0 JL avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y5 VMOVDQU 128(AX), Y6 VPCMPEQQ Y0, Y2, Y1 VPMOVMSKB Y1, BX VPCMPEQQ Y2, Y4, Y3 VPMOVMSKB Y3, SI VPCMPEQQ Y4, Y5, Y0 VPMOVMSKB Y0, DI VPCMPEQQ Y5, Y6, Y6 VPMOVMSKB Y6, R8 POPCNTQ BX, BX SHRQ $0x05, BX POPCNTQ SI, SI SHRQ $0x05, SI POPCNTQ DI, DI SHRQ $0x05, DI POPCNTQ R8, R8 SHRQ $0x05, R8 ADDQ SI, BX ADDQ R8, DI ADDQ DI, BX ADDQ BX, DX ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x60 JL avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y3 VPCMPEQQ Y0, Y2, Y1 VPMOVMSKB Y1, BX VPCMPEQQ Y2, Y3, Y3 VPMOVMSKB Y3, SI POPCNTQ BX, BX SHRQ $0x05, BX POPCNTQ SI, SI SHRQ $0x05, SI ADDQ SI, BX ADDQ BX, DX ADDQ $0x00000040, 
AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x40 JL avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y1 VPCMPEQQ Y0, Y1, Y1 VPMOVMSKB Y1, BX POPCNTQ BX, BX SHRQ $0x05, BX ADDQ BX, DX ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: VZEROUPPER JMP tail golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/count_pair_default.go000066400000000000000000000007211452252572700266040ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package mem func countPair1(b []byte) int { return countPairGeneric(b, 1) } func countPair2(b []byte) int { return countPairGeneric(b, 2) } func countPair4(b []byte) int { return countPairGeneric(b, 4) } func countPair8(b []byte) int { return countPairGeneric(b, 8) } func countPair16(b []byte) int { return countPairGeneric(b, 16) } func countPair32(b []byte) int { return countPairGeneric(b, 32) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/count_pair_test.go000066400000000000000000000074621452252572700261500ustar00rootroot00000000000000package mem import ( "fmt" "testing" ) var ( countPairSizes = [...]int{ 1, 2, 4, 8, 10, 16, 32, } ) func TestCountPair(t *testing.T) { for _, size := range countPairSizes { makeInput := func(values ...byte) []byte { input := make([]byte, size*len(values)) for i := range values { input[i*size] = values[i] } return input } t.Run(fmt.Sprintf("N=%d", size), func(t *testing.T) { tests := []struct { scenario string input []byte count int }{ { scenario: "empty input", input: nil, count: 0, }, { scenario: "input with only one item", input: makeInput(1), count: 0, }, { scenario: "input with two non-equal items", input: makeInput(1, 2), count: 0, }, { scenario: "input with two equal items", input: makeInput(1, 1), count: 1, }, { scenario: "input with two equal items in the middle", input: makeInput(0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9), count: 1, }, { scenario: "input with two equal items at the end", input: makeInput(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9), count: 1, }, { scenario: "input with many equal items at the beginning of a long sequence", input: makeInput( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, ), count: 9, }, { scenario: "input with many equal items in the middle of a long sequence", input: makeInput( 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, ), count: 9, }, { scenario: "input with many equal items in a long sequence", input: makeInput( 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 
7, 0, 0, 1, 2, 3, 4, 5, 6, 7, 0, ), count: 15, }, } for _, test := range tests { t.Run(test.scenario, func(t *testing.T) { n := CountPair(test.input, size) if n != test.count { t.Errorf("expected=%d found=%d", test.count, n) } }) } }) } } func BenchmarkCountPair(b *testing.B) { for _, size := range countPairSizes { input := make([]byte, 16*1024) for i := range input { input[i] = byte(i) } if size%len(input) != 0 { input = input[:(len(input)/size)*size] } b.Run(fmt.Sprintf("N=%d", size), func(b *testing.B) { b.SetBytes(int64(len(input))) for i := 0; i < b.N; i++ { n := CountPair(input, size) if n != 0 { b.Fatal("unexpected result:", n) } } }) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/index_pair.go000066400000000000000000000017021452252572700250570ustar00rootroot00000000000000package mem import ( "bytes" ) // IndexPair returns the byte index of the first pair of two equal elements of // size n. // // If no pairs of equal elements were found, -1 is returned. func IndexPair(b []byte, n int) int { if len(b)%n != 0 { panic("input length is not a multiple of the item size") } // Delegate to indexPair to keep the function cost low and allow the size // check to be inlined and the modulo optimized away for power of two sizes // known at compile time. return indexPair(b, n) } func indexPair(b []byte, n int) int { switch n { case 1: return indexPair1(b) case 2: return indexPair2(b) case 4: return indexPair4(b) case 8: return indexPair8(b) case 16: return indexPair16(b) case 32: return indexPair32(b) default: return indexPairGeneric(b, n) } } func indexPairGeneric(b []byte, n int) int { for i := n; i < len(b); i += n { if bytes.Equal(b[i-n:i], b[i:i+n]) { return i - n } } return -1 } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/index_pair_amd64.go000066400000000000000000000005541452252572700260560ustar00rootroot00000000000000// Code generated by command: go run index_pair_asm.go -pkg mem -out ../mem/index_pair_amd64.s -stubs ../mem/index_pair_amd64.go. DO NOT EDIT. //go:build !purego package mem func indexPair1(b []byte) int func indexPair2(b []byte) int func indexPair4(b []byte) int func indexPair8(b []byte) int func indexPair16(b []byte) int func indexPair32(b []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/index_pair_amd64.s000066400000000000000000000662371452252572700257250ustar00rootroot00000000000000// Code generated by command: go run index_pair_asm.go -pkg mem -out ../mem/index_pair_amd64.s -stubs ../mem/index_pair_amd64.go. DO NOT EDIT. 
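// The routines below all follow the same shape (a sketch of the scheme, not
// generated output): an AVX2 loop loads each 32-byte block and the same block
// shifted by one item, compares them with VPCMPEQ*, and ORs the resulting
// VPMOVMSKB bitmasks together; only when some lane matched does control fall
// into the avx2_done* blocks, where TZCNTQ converts the first non-zero mask
// into the byte offset of the match. A scalar loop handles short tails.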
//go:build !purego #include "textflag.h" // func indexPair1(b []byte) int // Requires: AVX, AVX2, BMI TEXT ·indexPair1(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX MOVQ AX, DX CMPQ CX, $0x00 JLE fail SUBQ $0x01, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JE fail generic: MOVB (AX), BL CMPB BL, 1(AX) JE done ADDQ $0x01, AX SUBQ $0x01, CX CMPQ CX, $0x00 JA generic fail: MOVQ $0xffffffffffffffff, AX MOVQ AX, ret+24(FP) RET done: SUBQ DX, AX MOVQ AX, ret+24(FP) RET avx2: CMPQ CX, $0x00000101 JB avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 1(AX), Y1 VMOVDQU 33(AX), Y3 VMOVDQU 65(AX), Y5 VMOVDQU 97(AX), Y7 VMOVDQU 129(AX), Y9 VMOVDQU 161(AX), Y11 VMOVDQU 193(AX), Y13 VMOVDQU 225(AX), Y15 VPCMPEQB Y0, Y1, Y1 VPCMPEQB Y2, Y3, Y3 VPCMPEQB Y4, Y5, Y5 VPCMPEQB Y6, Y7, Y7 VPCMPEQB Y8, Y9, Y9 VPCMPEQB Y10, Y11, Y11 VPCMPEQB Y12, Y13, Y13 VPCMPEQB Y14, Y15, Y15 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 VPMOVMSKB Y9, R9 VPMOVMSKB Y11, R10 VPMOVMSKB Y13, R11 VPMOVMSKB Y15, R12 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 ORQ R9, R13 ORQ R10, R13 ORQ R11, R13 ORQ R12, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000101 JAE avx2_loop256 avx2_tail128: CMPQ CX, $0x81 JB avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 1(AX), Y1 VMOVDQU 33(AX), Y3 VMOVDQU 65(AX), Y5 VMOVDQU 97(AX), Y7 VPCMPEQB Y0, Y1, Y1 VPCMPEQB Y2, Y3, Y3 VPCMPEQB Y4, Y5, Y5 VPCMPEQB Y6, Y7, Y7 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x41 JB avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 1(AX), Y1 VMOVDQU 33(AX), Y3 VPCMPEQB Y0, Y1, Y1 VPCMPEQB Y2, Y3, Y3 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x21 JB avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 1(AX), Y1 VPCMPEQB Y0, Y1, Y1 VPMOVMSKB Y1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x11 JB avx2_tail VMOVDQU (AX), X0 VMOVDQU 1(AX), X1 VPCMPEQB X0, X1, X1 VPMOVMSKB X1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER JMP tail avx2_done: VZEROUPPER CMPQ BX, $0x00 JNE avx2_done0 CMPQ SI, $0x00 JNE avx2_done1 CMPQ DI, $0x00 JNE avx2_done2 CMPQ R8, $0x00 JNE avx2_done3 CMPQ R9, $0x00 JNE avx2_done4 CMPQ R10, $0x00 JNE avx2_done5 CMPQ R11, $0x00 JNE avx2_done6 CMPQ R12, $0x00 JNE avx2_done7 avx2_done0: TZCNTQ BX, BX ADDQ BX, AX SUBQ BX, CX JMP done avx2_done1: ADDQ $0x00000020, AX SUBQ $0x00000020, CX TZCNTQ SI, SI ADDQ SI, AX SUBQ SI, CX JMP done avx2_done2: ADDQ $0x00000040, AX SUBQ $0x00000040, CX TZCNTQ DI, DI ADDQ DI, AX SUBQ DI, CX JMP done avx2_done3: ADDQ $0x00000060, AX SUBQ $0x00000060, CX TZCNTQ R8, R8 ADDQ R8, AX SUBQ R8, CX JMP done avx2_done4: ADDQ $0x00000080, AX SUBQ $0x00000080, CX TZCNTQ R9, R9 ADDQ R9, AX SUBQ R9, CX JMP done avx2_done5: ADDQ $0x000000a0, AX SUBQ $0x000000a0, CX TZCNTQ R10, R10 ADDQ R10, AX SUBQ R10, CX JMP done avx2_done6: ADDQ 
$0x000000c0, AX SUBQ $0x000000c0, CX TZCNTQ R11, R11 ADDQ R11, AX SUBQ R11, CX JMP done avx2_done7: ADDQ $0x000000e0, AX SUBQ $0x000000e0, CX TZCNTQ R12, R12 ADDQ R12, AX SUBQ R12, CX JMP done // func indexPair2(b []byte) int // Requires: AVX, AVX2, BMI TEXT ·indexPair2(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX MOVQ AX, DX CMPQ CX, $0x00 JLE fail SUBQ $0x02, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JE fail generic: MOVW (AX), BX CMPW BX, 2(AX) JE done ADDQ $0x02, AX SUBQ $0x02, CX CMPQ CX, $0x00 JA generic fail: MOVQ $0xffffffffffffffff, AX MOVQ AX, ret+24(FP) RET done: SUBQ DX, AX MOVQ AX, ret+24(FP) RET avx2: CMPQ CX, $0x00000102 JB avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 2(AX), Y1 VMOVDQU 34(AX), Y3 VMOVDQU 66(AX), Y5 VMOVDQU 98(AX), Y7 VMOVDQU 130(AX), Y9 VMOVDQU 162(AX), Y11 VMOVDQU 194(AX), Y13 VMOVDQU 226(AX), Y15 VPCMPEQW Y0, Y1, Y1 VPCMPEQW Y2, Y3, Y3 VPCMPEQW Y4, Y5, Y5 VPCMPEQW Y6, Y7, Y7 VPCMPEQW Y8, Y9, Y9 VPCMPEQW Y10, Y11, Y11 VPCMPEQW Y12, Y13, Y13 VPCMPEQW Y14, Y15, Y15 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 VPMOVMSKB Y9, R9 VPMOVMSKB Y11, R10 VPMOVMSKB Y13, R11 VPMOVMSKB Y15, R12 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 ORQ R9, R13 ORQ R10, R13 ORQ R11, R13 ORQ R12, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000102 JAE avx2_loop256 avx2_tail128: CMPQ CX, $0x82 JB avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 2(AX), Y1 VMOVDQU 34(AX), Y3 VMOVDQU 66(AX), Y5 VMOVDQU 98(AX), Y7 VPCMPEQW Y0, Y1, Y1 VPCMPEQW Y2, Y3, Y3 VPCMPEQW Y4, Y5, Y5 VPCMPEQW Y6, Y7, Y7 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x42 JB avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 2(AX), Y1 VMOVDQU 34(AX), Y3 VPCMPEQW Y0, Y1, Y1 VPCMPEQW Y2, Y3, Y3 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x22 JB avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 2(AX), Y1 VPCMPEQW Y0, Y1, Y1 VPMOVMSKB Y1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x12 JB avx2_tail VMOVDQU (AX), X0 VMOVDQU 2(AX), X1 VPCMPEQW X0, X1, X1 VPMOVMSKB X1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER JMP tail avx2_done: VZEROUPPER CMPQ BX, $0x00 JNE avx2_done0 CMPQ SI, $0x00 JNE avx2_done1 CMPQ DI, $0x00 JNE avx2_done2 CMPQ R8, $0x00 JNE avx2_done3 CMPQ R9, $0x00 JNE avx2_done4 CMPQ R10, $0x00 JNE avx2_done5 CMPQ R11, $0x00 JNE avx2_done6 CMPQ R12, $0x00 JNE avx2_done7 avx2_done0: TZCNTQ BX, BX ADDQ BX, AX SUBQ BX, CX JMP done avx2_done1: ADDQ $0x00000020, AX SUBQ $0x00000020, CX TZCNTQ SI, SI ADDQ SI, AX SUBQ SI, CX JMP done avx2_done2: ADDQ $0x00000040, AX SUBQ $0x00000040, CX TZCNTQ DI, DI ADDQ DI, AX SUBQ DI, CX JMP done avx2_done3: ADDQ $0x00000060, AX SUBQ $0x00000060, CX TZCNTQ R8, R8 ADDQ R8, AX SUBQ R8, CX JMP done avx2_done4: ADDQ $0x00000080, AX SUBQ $0x00000080, CX TZCNTQ R9, R9 ADDQ R9, AX 
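	// (At this point TZCNTQ has turned the VPMOVMSKB bitmask into the byte
	// offset of the first equal pair within its 32-byte block, and AX points
	// at the match; the done label returns AX minus the original base kept
	// in DX, while CX merely tracks the remaining length for symmetry.)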
SUBQ R9, CX JMP done avx2_done5: ADDQ $0x000000a0, AX SUBQ $0x000000a0, CX TZCNTQ R10, R10 ADDQ R10, AX SUBQ R10, CX JMP done avx2_done6: ADDQ $0x000000c0, AX SUBQ $0x000000c0, CX TZCNTQ R11, R11 ADDQ R11, AX SUBQ R11, CX JMP done avx2_done7: ADDQ $0x000000e0, AX SUBQ $0x000000e0, CX TZCNTQ R12, R12 ADDQ R12, AX SUBQ R12, CX JMP done // func indexPair4(b []byte) int // Requires: AVX, AVX2, BMI TEXT ·indexPair4(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX MOVQ AX, DX CMPQ CX, $0x00 JLE fail SUBQ $0x04, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JE fail generic: MOVL (AX), BX CMPL BX, 4(AX) JE done ADDQ $0x04, AX SUBQ $0x04, CX CMPQ CX, $0x00 JA generic fail: MOVQ $0xffffffffffffffff, AX MOVQ AX, ret+24(FP) RET done: SUBQ DX, AX MOVQ AX, ret+24(FP) RET avx2: CMPQ CX, $0x00000104 JB avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 4(AX), Y1 VMOVDQU 36(AX), Y3 VMOVDQU 68(AX), Y5 VMOVDQU 100(AX), Y7 VMOVDQU 132(AX), Y9 VMOVDQU 164(AX), Y11 VMOVDQU 196(AX), Y13 VMOVDQU 228(AX), Y15 VPCMPEQD Y0, Y1, Y1 VPCMPEQD Y2, Y3, Y3 VPCMPEQD Y4, Y5, Y5 VPCMPEQD Y6, Y7, Y7 VPCMPEQD Y8, Y9, Y9 VPCMPEQD Y10, Y11, Y11 VPCMPEQD Y12, Y13, Y13 VPCMPEQD Y14, Y15, Y15 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 VPMOVMSKB Y9, R9 VPMOVMSKB Y11, R10 VPMOVMSKB Y13, R11 VPMOVMSKB Y15, R12 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 ORQ R9, R13 ORQ R10, R13 ORQ R11, R13 ORQ R12, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000104 JAE avx2_loop256 avx2_tail128: CMPQ CX, $0x84 JB avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 4(AX), Y1 VMOVDQU 36(AX), Y3 VMOVDQU 68(AX), Y5 VMOVDQU 100(AX), Y7 VPCMPEQD Y0, Y1, Y1 VPCMPEQD Y2, Y3, Y3 VPCMPEQD Y4, Y5, Y5 VPCMPEQD Y6, Y7, Y7 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x44 JB avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 4(AX), Y1 VMOVDQU 36(AX), Y3 VPCMPEQD Y0, Y1, Y1 VPCMPEQD Y2, Y3, Y3 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x24 JB avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 4(AX), Y1 VPCMPEQD Y0, Y1, Y1 VPMOVMSKB Y1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x14 JB avx2_tail VMOVDQU (AX), X0 VMOVDQU 4(AX), X1 VPCMPEQD X0, X1, X1 VPMOVMSKB X1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER JMP tail avx2_done: VZEROUPPER CMPQ BX, $0x00 JNE avx2_done0 CMPQ SI, $0x00 JNE avx2_done1 CMPQ DI, $0x00 JNE avx2_done2 CMPQ R8, $0x00 JNE avx2_done3 CMPQ R9, $0x00 JNE avx2_done4 CMPQ R10, $0x00 JNE avx2_done5 CMPQ R11, $0x00 JNE avx2_done6 CMPQ R12, $0x00 JNE avx2_done7 avx2_done0: TZCNTQ BX, BX ADDQ BX, AX SUBQ BX, CX JMP done avx2_done1: ADDQ $0x00000020, AX SUBQ $0x00000020, CX TZCNTQ SI, SI ADDQ SI, AX SUBQ SI, CX JMP done avx2_done2: ADDQ $0x00000040, AX SUBQ $0x00000040, CX TZCNTQ DI, DI ADDQ DI, AX SUBQ DI, CX JMP done avx2_done3: ADDQ $0x00000060, AX 
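	// ($0x00000060 is the byte offset of this mask's 32-byte block within
	// the 256-byte unrolled iteration: avx2_done3 corresponds to the fourth
	// block, 3*32 = 96 bytes in.)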
SUBQ $0x00000060, CX TZCNTQ R8, R8 ADDQ R8, AX SUBQ R8, CX JMP done avx2_done4: ADDQ $0x00000080, AX SUBQ $0x00000080, CX TZCNTQ R9, R9 ADDQ R9, AX SUBQ R9, CX JMP done avx2_done5: ADDQ $0x000000a0, AX SUBQ $0x000000a0, CX TZCNTQ R10, R10 ADDQ R10, AX SUBQ R10, CX JMP done avx2_done6: ADDQ $0x000000c0, AX SUBQ $0x000000c0, CX TZCNTQ R11, R11 ADDQ R11, AX SUBQ R11, CX JMP done avx2_done7: ADDQ $0x000000e0, AX SUBQ $0x000000e0, CX TZCNTQ R12, R12 ADDQ R12, AX SUBQ R12, CX JMP done // func indexPair8(b []byte) int // Requires: AVX, AVX2, BMI TEXT ·indexPair8(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX MOVQ AX, DX CMPQ CX, $0x00 JLE fail SUBQ $0x08, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JE fail generic: MOVQ (AX), BX CMPQ BX, 8(AX) JE done ADDQ $0x08, AX SUBQ $0x08, CX CMPQ CX, $0x00 JA generic fail: MOVQ $0xffffffffffffffff, AX MOVQ AX, ret+24(FP) RET done: SUBQ DX, AX MOVQ AX, ret+24(FP) RET avx2: CMPQ CX, $0x00000108 JB avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VMOVDQU 8(AX), Y1 VMOVDQU 40(AX), Y3 VMOVDQU 72(AX), Y5 VMOVDQU 104(AX), Y7 VMOVDQU 136(AX), Y9 VMOVDQU 168(AX), Y11 VMOVDQU 200(AX), Y13 VMOVDQU 232(AX), Y15 VPCMPEQQ Y0, Y1, Y1 VPCMPEQQ Y2, Y3, Y3 VPCMPEQQ Y4, Y5, Y5 VPCMPEQQ Y6, Y7, Y7 VPCMPEQQ Y8, Y9, Y9 VPCMPEQQ Y10, Y11, Y11 VPCMPEQQ Y12, Y13, Y13 VPCMPEQQ Y14, Y15, Y15 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 VPMOVMSKB Y9, R9 VPMOVMSKB Y11, R10 VPMOVMSKB Y13, R11 VPMOVMSKB Y15, R12 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 ORQ R9, R13 ORQ R10, R13 ORQ R11, R13 ORQ R12, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000108 JAE avx2_loop256 avx2_tail128: CMPQ CX, $0x88 JB avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 8(AX), Y1 VMOVDQU 40(AX), Y3 VMOVDQU 72(AX), Y5 VMOVDQU 104(AX), Y7 VPCMPEQQ Y0, Y1, Y1 VPCMPEQQ Y2, Y3, Y3 VPCMPEQQ Y4, Y5, Y5 VPCMPEQQ Y6, Y7, Y7 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI VPMOVMSKB Y5, DI VPMOVMSKB Y7, R8 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x48 JB avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 8(AX), Y1 VMOVDQU 40(AX), Y3 VPCMPEQQ Y0, Y1, Y1 VPCMPEQQ Y2, Y3, Y3 VPMOVMSKB Y1, BX VPMOVMSKB Y3, SI XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x28 JB avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 8(AX), Y1 VPCMPEQQ Y0, Y1, Y1 VPMOVMSKB Y1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: CMPQ CX, $0x18 JB avx2_tail VMOVDQU (AX), X0 VMOVDQU 8(AX), X1 VPCMPEQQ X0, X1, X1 VPMOVMSKB X1, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x10, AX SUBQ $0x10, CX avx2_tail: VZEROUPPER JMP tail avx2_done: VZEROUPPER CMPQ BX, $0x00 JNE avx2_done0 CMPQ SI, $0x00 JNE avx2_done1 CMPQ DI, $0x00 JNE avx2_done2 CMPQ R8, $0x00 JNE avx2_done3 CMPQ R9, $0x00 JNE avx2_done4 CMPQ R10, $0x00 JNE avx2_done5 CMPQ R11, $0x00 JNE avx2_done6 CMPQ R12, $0x00 JNE avx2_done7 avx2_done0: TZCNTQ BX, BX ADDQ BX, AX SUBQ BX, CX JMP done avx2_done1: ADDQ $0x00000020, AX SUBQ $0x00000020, CX TZCNTQ SI, SI ADDQ SI, AX SUBQ SI, 
CX JMP done avx2_done2: ADDQ $0x00000040, AX SUBQ $0x00000040, CX TZCNTQ DI, DI ADDQ DI, AX SUBQ DI, CX JMP done avx2_done3: ADDQ $0x00000060, AX SUBQ $0x00000060, CX TZCNTQ R8, R8 ADDQ R8, AX SUBQ R8, CX JMP done avx2_done4: ADDQ $0x00000080, AX SUBQ $0x00000080, CX TZCNTQ R9, R9 ADDQ R9, AX SUBQ R9, CX JMP done avx2_done5: ADDQ $0x000000a0, AX SUBQ $0x000000a0, CX TZCNTQ R10, R10 ADDQ R10, AX SUBQ R10, CX JMP done avx2_done6: ADDQ $0x000000c0, AX SUBQ $0x000000c0, CX TZCNTQ R11, R11 ADDQ R11, AX SUBQ R11, CX JMP done avx2_done7: ADDQ $0x000000e0, AX SUBQ $0x000000e0, CX TZCNTQ R12, R12 ADDQ R12, AX SUBQ R12, CX JMP done // func indexPair16(b []byte) int // Requires: AVX, AVX2, BMI, SSE2, SSE4.1 TEXT ·indexPair16(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX MOVQ AX, DX CMPQ CX, $0x00 JLE fail SUBQ $0x10, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JE fail generic: MOVOU (AX), X0 MOVOU 16(AX), X1 PCMPEQQ X0, X1 PMOVMSKB X1, BX CMPL BX, $0x0000ffff JE done ADDQ $0x10, AX SUBQ $0x10, CX CMPQ CX, $0x00 JA generic fail: MOVQ $0xffffffffffffffff, AX MOVQ AX, ret+24(FP) RET done: SUBQ DX, AX MOVQ AX, ret+24(FP) RET avx2: CMPQ CX, $0x00000110 JB avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VMOVDQU 128(AX), Y8 VMOVDQU 160(AX), Y10 VMOVDQU 192(AX), Y12 VMOVDQU 224(AX), Y14 VPERM2I128 $0x21, Y2, Y0, Y1 VPERM2I128 $0x21, Y4, Y2, Y3 VPERM2I128 $0x21, Y6, Y4, Y5 VPERM2I128 $0x21, Y8, Y6, Y7 VPERM2I128 $0x21, Y10, Y8, Y9 VPERM2I128 $0x21, Y12, Y10, Y11 VPERM2I128 $0x21, Y14, Y12, Y13 VMOVDQU 240(AX), Y15 VPCMPEQQ Y0, Y1, Y1 VPCMPEQQ Y2, Y3, Y3 VPCMPEQQ Y4, Y5, Y5 VPCMPEQQ Y6, Y7, Y7 VPCMPEQQ Y8, Y9, Y9 VPCMPEQQ Y10, Y11, Y11 VPCMPEQQ Y12, Y13, Y13 VPCMPEQQ Y14, Y15, Y15 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX VPERMQ $0xb1, Y3, Y2 VPAND Y3, Y2, Y2 VPMOVMSKB Y2, SI VPERMQ $0xb1, Y5, Y4 VPAND Y5, Y4, Y4 VPMOVMSKB Y4, DI VPERMQ $0xb1, Y7, Y6 VPAND Y7, Y6, Y6 VPMOVMSKB Y6, R8 VPERMQ $0xb1, Y9, Y8 VPAND Y9, Y8, Y8 VPMOVMSKB Y8, R9 VPERMQ $0xb1, Y11, Y10 VPAND Y11, Y10, Y10 VPMOVMSKB Y10, R10 VPERMQ $0xb1, Y13, Y12 VPAND Y13, Y12, Y12 VPMOVMSKB Y12, R11 VPERMQ $0xb1, Y15, Y14 VPAND Y15, Y14, Y14 VPMOVMSKB Y14, R12 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 ORQ R9, R13 ORQ R10, R13 ORQ R11, R13 ORQ R12, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000110 JAE avx2_loop256 avx2_tail128: CMPQ CX, $0x90 JB avx2_tail64 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y6 VPERM2I128 $0x21, Y2, Y0, Y1 VPERM2I128 $0x21, Y4, Y2, Y3 VPERM2I128 $0x21, Y6, Y4, Y5 VMOVDQU 112(AX), Y7 VPCMPEQQ Y0, Y1, Y1 VPCMPEQQ Y2, Y3, Y3 VPCMPEQQ Y4, Y5, Y5 VPCMPEQQ Y6, Y7, Y7 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX VPERMQ $0xb1, Y3, Y2 VPAND Y3, Y2, Y2 VPMOVMSKB Y2, SI VPERMQ $0xb1, Y5, Y4 VPAND Y5, Y4, Y4 VPMOVMSKB Y4, DI VPERMQ $0xb1, Y7, Y6 VPAND Y7, Y6, Y6 VPMOVMSKB Y6, R8 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x50 JB avx2_tail32 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VPERM2I128 $0x21, Y2, Y0, Y1 VMOVDQU 48(AX), Y3 VPCMPEQQ Y0, Y1, Y1 VPCMPEQQ Y2, Y3, Y3 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX VPERMQ $0xb1, Y3, Y2 VPAND Y3, Y2, Y2 VPMOVMSKB Y2, SI XORQ R13, R13 ORQ BX, R13 ORQ SI, 
R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x30 JB avx2_tail16 VMOVDQU (AX), Y0 VMOVDQU 16(AX), Y1 VPCMPEQQ Y0, Y1, Y1 VPERMQ $0xb1, Y1, Y0 VPAND Y1, Y0, Y0 VPMOVMSKB Y0, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: VZEROUPPER JMP tail avx2_done: VZEROUPPER CMPQ BX, $0x00 JNE avx2_done0 CMPQ SI, $0x00 JNE avx2_done1 CMPQ DI, $0x00 JNE avx2_done2 CMPQ R8, $0x00 JNE avx2_done3 CMPQ R9, $0x00 JNE avx2_done4 CMPQ R10, $0x00 JNE avx2_done5 CMPQ R11, $0x00 JNE avx2_done6 CMPQ R12, $0x00 JNE avx2_done7 avx2_done0: TZCNTQ BX, BX ADDQ BX, AX SUBQ BX, CX JMP done avx2_done1: ADDQ $0x00000020, AX SUBQ $0x00000020, CX TZCNTQ SI, SI ADDQ SI, AX SUBQ SI, CX JMP done avx2_done2: ADDQ $0x00000040, AX SUBQ $0x00000040, CX TZCNTQ DI, DI ADDQ DI, AX SUBQ DI, CX JMP done avx2_done3: ADDQ $0x00000060, AX SUBQ $0x00000060, CX TZCNTQ R8, R8 ADDQ R8, AX SUBQ R8, CX JMP done avx2_done4: ADDQ $0x00000080, AX SUBQ $0x00000080, CX TZCNTQ R9, R9 ADDQ R9, AX SUBQ R9, CX JMP done avx2_done5: ADDQ $0x000000a0, AX SUBQ $0x000000a0, CX TZCNTQ R10, R10 ADDQ R10, AX SUBQ R10, CX JMP done avx2_done6: ADDQ $0x000000c0, AX SUBQ $0x000000c0, CX TZCNTQ R11, R11 ADDQ R11, AX SUBQ R11, CX JMP done avx2_done7: ADDQ $0x000000e0, AX SUBQ $0x000000e0, CX TZCNTQ R12, R12 ADDQ R12, AX SUBQ R12, CX JMP done // func indexPair32(b []byte) int // Requires: AVX, AVX2, BMI, CMOV, SSE2, SSE4.1 TEXT ·indexPair32(SB), NOSPLIT, $0-32 MOVQ b_base+0(FP), AX MOVQ b_len+8(FP), CX MOVQ AX, DX CMPQ CX, $0x00 JLE fail SUBQ $0x20, CX BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 tail: CMPQ CX, $0x00 JE fail generic: MOVOU (AX), X0 MOVOU 16(AX), X1 MOVOU 32(AX), X2 MOVOU 48(AX), X3 PCMPEQQ X0, X2 PCMPEQQ X1, X3 PMOVMSKB X2, BX PMOVMSKB X3, SI ANDL SI, BX CMPL BX, $0x0000ffff JE done ADDQ $0x20, AX SUBQ $0x20, CX CMPQ CX, $0x00 JA generic fail: MOVQ $0xffffffffffffffff, AX MOVQ AX, ret+24(FP) RET done: SUBQ DX, AX MOVQ AX, ret+24(FP) RET avx2: CMPQ CX, $0x00000120 JB avx2_tail128 XORQ BX, BX XORQ SI, SI XORQ DI, DI XORQ R8, R8 XORQ R9, R9 XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 avx2_loop256: XORQ R13, R13 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y5 VMOVDQU 128(AX), Y7 VMOVDQU 160(AX), Y8 VMOVDQU 192(AX), Y9 VMOVDQU 224(AX), Y10 VMOVDQU 256(AX), Y11 VPCMPEQQ Y0, Y2, Y1 VPCMPEQQ Y2, Y4, Y3 VPCMPEQQ Y4, Y5, Y0 VPCMPEQQ Y5, Y7, Y6 VPCMPEQQ Y7, Y8, Y2 VPCMPEQQ Y8, Y9, Y4 VPCMPEQQ Y9, Y10, Y5 VPCMPEQQ Y10, Y11, Y11 VPMOVMSKB Y1, BX CMPL BX, $0xffffffff CMOVLNE R13, BX VPMOVMSKB Y3, SI CMPL SI, $0xffffffff CMOVLNE R13, SI VPMOVMSKB Y0, DI CMPL DI, $0xffffffff CMOVLNE R13, DI VPMOVMSKB Y6, R8 CMPL R8, $0xffffffff CMOVLNE R13, R8 VPMOVMSKB Y2, R9 CMPL R9, $0xffffffff CMOVLNE R13, R9 VPMOVMSKB Y4, R10 CMPL R10, $0xffffffff CMOVLNE R13, R10 VPMOVMSKB Y5, R11 CMPL R11, $0xffffffff CMOVLNE R13, R11 VPMOVMSKB Y11, R12 CMPL R12, $0xffffffff CMOVLNE R13, R12 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 ORQ R9, R13 ORQ R10, R13 ORQ R11, R13 ORQ R12, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000100, AX SUBQ $0x00000100, CX CMPQ CX, $0x00000120 JAE avx2_loop256 avx2_tail128: CMPQ CX, $0xa0 JB avx2_tail64 XORQ R13, R13 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y4 VMOVDQU 96(AX), Y5 VMOVDQU 128(AX), Y6 VPCMPEQQ Y0, Y2, Y1 VPCMPEQQ Y2, Y4, Y3 VPCMPEQQ Y4, Y5, Y0 VPCMPEQQ Y5, Y6, Y6 VPMOVMSKB Y1, BX CMPL BX, $0xffffffff CMOVLNE R13, BX VPMOVMSKB Y3, SI CMPL SI, $0xffffffff CMOVLNE R13, SI VPMOVMSKB Y0, DI CMPL 
DI, $0xffffffff CMOVLNE R13, DI VPMOVMSKB Y6, R8 CMPL R8, $0xffffffff CMOVLNE R13, R8 XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 ORQ DI, R13 ORQ R8, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000080, AX SUBQ $0x00000080, CX avx2_tail64: CMPQ CX, $0x60 JB avx2_tail32 XORQ R13, R13 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y2 VMOVDQU 64(AX), Y3 VPCMPEQQ Y0, Y2, Y1 VPCMPEQQ Y2, Y3, Y3 VPMOVMSKB Y1, BX CMPL BX, $0xffffffff CMOVLNE R13, BX VPMOVMSKB Y3, SI CMPL SI, $0xffffffff CMOVLNE R13, SI XORQ R13, R13 ORQ BX, R13 ORQ SI, R13 CMPQ R13, $0x00 JNE avx2_done ADDQ $0x00000040, AX SUBQ $0x00000040, CX avx2_tail32: CMPQ CX, $0x40 JB avx2_tail16 XORQ R13, R13 VMOVDQU (AX), Y0 VMOVDQU 32(AX), Y1 VPCMPEQQ Y0, Y1, Y1 VPMOVMSKB Y1, BX CMPL BX, $0xffffffff CMOVLNE R13, BX CMPQ BX, $0x00 JNE avx2_done ADDQ $0x00000020, AX SUBQ $0x00000020, CX avx2_tail16: VZEROUPPER JMP tail avx2_done: VZEROUPPER CMPQ BX, $0x00 JNE avx2_done0 CMPQ SI, $0x00 JNE avx2_done1 CMPQ DI, $0x00 JNE avx2_done2 CMPQ R8, $0x00 JNE avx2_done3 CMPQ R9, $0x00 JNE avx2_done4 CMPQ R10, $0x00 JNE avx2_done5 CMPQ R11, $0x00 JNE avx2_done6 CMPQ R12, $0x00 JNE avx2_done7 avx2_done0: TZCNTQ BX, BX ADDQ BX, AX SUBQ BX, CX JMP done avx2_done1: ADDQ $0x00000020, AX SUBQ $0x00000020, CX TZCNTQ SI, SI ADDQ SI, AX SUBQ SI, CX JMP done avx2_done2: ADDQ $0x00000040, AX SUBQ $0x00000040, CX TZCNTQ DI, DI ADDQ DI, AX SUBQ DI, CX JMP done avx2_done3: ADDQ $0x00000060, AX SUBQ $0x00000060, CX TZCNTQ R8, R8 ADDQ R8, AX SUBQ R8, CX JMP done avx2_done4: ADDQ $0x00000080, AX SUBQ $0x00000080, CX TZCNTQ R9, R9 ADDQ R9, AX SUBQ R9, CX JMP done avx2_done5: ADDQ $0x000000a0, AX SUBQ $0x000000a0, CX TZCNTQ R10, R10 ADDQ R10, AX SUBQ R10, CX JMP done avx2_done6: ADDQ $0x000000c0, AX SUBQ $0x000000c0, CX TZCNTQ R11, R11 ADDQ R11, AX SUBQ R11, CX JMP done avx2_done7: ADDQ $0x000000e0, AX SUBQ $0x000000e0, CX TZCNTQ R12, R12 ADDQ R12, AX SUBQ R12, CX JMP done golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/index_pair_default.go000066400000000000000000000007211452252572700265630ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package mem func indexPair1(b []byte) int { return indexPairGeneric(b, 1) } func indexPair2(b []byte) int { return indexPairGeneric(b, 2) } func indexPair4(b []byte) int { return indexPairGeneric(b, 4) } func indexPair8(b []byte) int { return indexPairGeneric(b, 8) } func indexPair16(b []byte) int { return indexPairGeneric(b, 16) } func indexPair32(b []byte) int { return indexPairGeneric(b, 32) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/index_pair_test.go000066400000000000000000000075771452252572700261360ustar00rootroot00000000000000package mem import ( "fmt" "testing" ) var ( indexPairSizes = [...]int{ 1, 2, 4, 8, 10, 16, 32, } ) func TestIndexPair(t *testing.T) { for _, size := range indexPairSizes { makeInput := func(values ...byte) []byte { input := make([]byte, size*len(values)) for i := range values { input[i*size] = values[i] } return input } t.Run(fmt.Sprintf("N=%d", size), func(t *testing.T) { tests := []struct { scenario string input []byte index int }{ { scenario: "empty input", input: nil, index: -1, }, { scenario: "input with only one item", input: makeInput(1), index: -1, }, { scenario: "input with two non-equal items", input: makeInput(1, 2), index: -1, }, { scenario: "input with two equal items", input: makeInput(1, 1), index: 0, }, { scenario: "input with two equal items in the middle", input: makeInput(0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9), index: 5, }, { scenario: "input with two equal 
items at the end", input: makeInput(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9), index: 9, }, { scenario: "input with two equal items at the beginning of a long sequence", input: makeInput( 0, 0, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, ), index: 0, }, { scenario: "input with two equal items in the middle of a long sequence", input: makeInput( 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 7, // pair 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, ), index: 61, }, { scenario: "input with two equal items at the end of a long sequence", input: makeInput( 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 7, // pair ), index: 142, }, } for _, test := range tests { t.Run(test.scenario, func(t *testing.T) { i := test.index * size j := IndexPair(test.input, size) if i < 0 { i = -1 } if i != j { t.Errorf("expected=%d found=%d", i, j) } }) } }) } } func BenchmarkIndexPair(b *testing.B) { for _, size := range indexPairSizes { input := make([]byte, 16*1024) for i := range input { input[i] = byte(i) } if size%len(input) != 0 { input = input[:(len(input)/size)*size] } b.Run(fmt.Sprintf("N=%d", size), func(b *testing.B) { b.SetBytes(int64(len(input))) for i := 0; i < b.N; i++ { n := IndexPair(input, size) if n != -1 { b.Fatal("unexpected result:", n) } } }) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/mask.go000066400000000000000000000003471452252572700236740ustar00rootroot00000000000000package mem func maskGeneric(dst, src []byte) int { switch { case len(dst) < len(src): src = src[:len(dst)] case len(dst) > len(src): dst = dst[:len(src)] } for i := range dst { dst[i] &= src[i] } return len(dst) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/mask_amd64.go000066400000000000000000000004631452252572700246660ustar00rootroot00000000000000// Code generated by command: go run mask_asm.go -pkg mem -out ../mem/mask_amd64.s -stubs ../mem/mask_amd64.go. DO NOT EDIT. //go:build !purego package mem // Mask set bits of dst to zero and copies the one-bits of src to dst, returning the number of bytes written. func Mask(dst []byte, src []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/mask_amd64.s000066400000000000000000000065771452252572700245370ustar00rootroot00000000000000// Code generated by command: go run mask_asm.go -pkg mem -out ../mem/mask_amd64.s -stubs ../mem/mask_amd64.go. DO NOT EDIT. 
//go:build !purego

#include "textflag.h"

// func Mask(dst []byte, src []byte) int
// Requires: AVX, AVX2, CMOV, SSE2
TEXT ·Mask(SB), NOSPLIT, $0-56
	MOVQ dst_base+0(FP), AX
	MOVQ src_base+24(FP), CX
	MOVQ dst_len+8(FP), DX
	MOVQ src_len+32(FP), BX
	CMPQ BX, DX
	CMOVQLT BX, DX
	MOVQ DX, ret+48(FP)

tail:
	CMPQ DX, $0x00
	JE   done
	CMPQ DX, $0x01
	JE   handle1
	CMPQ DX, $0x03
	JBE  handle2to3
	CMPQ DX, $0x04
	JE   handle4
	CMPQ DX, $0x08
	JB   handle5to7
	JE   handle8
	CMPQ DX, $0x10
	JBE  handle9to16
	CMPQ DX, $0x20
	JBE  handle17to32
	CMPQ DX, $0x40
	JBE  handle33to64
	BTL  $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC  generic
	CMPQ DX, $0x00000080
	JB   avx2_tail
	JMP  avx2

generic:
	MOVOU (CX), X0
	MOVOU (AX), X1
	MOVOU 16(CX), X2
	MOVOU 16(AX), X3
	MOVOU 32(CX), X4
	MOVOU 32(AX), X5
	MOVOU 48(CX), X6
	MOVOU 48(AX), X7
	PAND  X1, X0
	PAND  X3, X2
	PAND  X5, X4
	PAND  X7, X6
	MOVOU X0, (AX)
	MOVOU X2, 16(AX)
	MOVOU X4, 32(AX)
	MOVOU X6, 48(AX)
	ADDQ  $0x40, CX
	ADDQ  $0x40, AX
	SUBQ  $0x40, DX
	CMPQ  DX, $0x40
	JBE   tail
	JMP   generic

done:
	RET

handle1:
	MOVB (CX), CL
	MOVB (AX), DL
	ANDB DL, CL
	MOVB CL, (AX)
	RET

handle2to3:
	MOVW (CX), BX
	MOVW (AX), SI
	MOVW -2(CX)(DX*1), CX
	MOVW -2(AX)(DX*1), DI
	ANDW SI, BX
	ANDW DI, CX
	MOVW BX, (AX)
	MOVW CX, -2(AX)(DX*1)
	RET

handle4:
	MOVL (CX), CX
	MOVL (AX), DX
	ANDL DX, CX
	MOVL CX, (AX)
	RET

handle5to7:
	MOVL (CX), BX
	MOVL (AX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL -4(AX)(DX*1), DI
	ANDL SI, BX
	ANDL DI, CX
	MOVL BX, (AX)
	MOVL CX, -4(AX)(DX*1)
	RET

handle8:
	MOVQ (CX), CX
	MOVQ (AX), DX
	ANDQ DX, CX
	MOVQ CX, (AX)
	RET

handle9to16:
	MOVQ (CX), BX
	MOVQ (AX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ -8(AX)(DX*1), DI
	ANDQ SI, BX
	ANDQ DI, CX
	MOVQ BX, (AX)
	MOVQ CX, -8(AX)(DX*1)
	RET

handle17to32:
	MOVOU (CX), X0
	MOVOU (AX), X1
	MOVOU -16(CX)(DX*1), X2
	MOVOU -16(AX)(DX*1), X3
	PAND  X1, X0
	PAND  X3, X2
	MOVOU X0, (AX)
	MOVOU X2, -16(AX)(DX*1)
	RET

handle33to64:
	MOVOU (CX), X0
	MOVOU (AX), X1
	MOVOU 16(CX), X2
	MOVOU 16(AX), X3
	MOVOU -32(CX)(DX*1), X4
	MOVOU -32(AX)(DX*1), X5
	MOVOU -16(CX)(DX*1), X6
	MOVOU -16(AX)(DX*1), X7
	PAND  X1, X0
	PAND  X3, X2
	PAND  X5, X4
	PAND  X7, X6
	MOVOU X0, (AX)
	MOVOU X2, 16(AX)
	MOVOU X4, -32(AX)(DX*1)
	MOVOU X6, -16(AX)(DX*1)
	RET

// AVX optimized version for medium to large size inputs.
avx2:
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y1
	VMOVDQU 64(CX), Y2
	VMOVDQU 96(CX), Y3
	VPAND   (AX), Y0, Y0
	VPAND   32(AX), Y1, Y1
	VPAND   64(AX), Y2, Y2
	VPAND   96(AX), Y3, Y3
	VMOVDQU Y0, (AX)
	VMOVDQU Y1, 32(AX)
	VMOVDQU Y2, 64(AX)
	VMOVDQU Y3, 96(AX)
	ADDQ    $0x00000080, CX
	ADDQ    $0x00000080, AX
	SUBQ    $0x00000080, DX
	JZ      avx2_done
	CMPQ    DX, $0x00000080
	JAE     avx2

avx2_tail:
	CMPQ    DX, $0x40
	JBE     avx2_tail_1to64
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y1
	VMOVDQU -64(CX)(DX*1), Y2
	VMOVDQU -32(CX)(DX*1), Y3
	VPAND   (AX), Y0, Y0
	VPAND   32(AX), Y1, Y1
	VPAND   -64(AX)(DX*1), Y2, Y2
	VPAND   -32(AX)(DX*1), Y3, Y3
	VMOVDQU Y0, (AX)
	VMOVDQU Y1, 32(AX)
	VMOVDQU Y2, -64(AX)(DX*1)
	VMOVDQU Y3, -32(AX)(DX*1)
	JMP     avx2_done

avx2_tail_1to64:
	VMOVDQU -64(CX)(DX*1), Y0
	VMOVDQU -32(CX)(DX*1), Y1
	VPAND   -64(AX)(DX*1), Y0, Y0
	VPAND   -32(AX)(DX*1), Y1, Y1
	VMOVDQU Y0, -64(AX)(DX*1)
	VMOVDQU Y1, -32(AX)(DX*1)

avx2_done:
	VZEROUPPER
	RET
golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/mask_default.go000066400000000000000000000003441452252572700253750ustar00rootroot00000000000000//go:build purego || !amd64
// +build purego !amd64

package mem

// Mask performs an AND of src and dst into dst, returning the number of bytes
// written.
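//
// A minimal usage sketch (hypothetical values):
//
//	dst := []byte{0b1111, 0b1010}
//	src := []byte{0b0101, 0b0110}
//	n := Mask(dst, src) // n == 2, dst is now {0b0101, 0b0010}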
func Mask(dst, src []byte) int { return maskGeneric(dst, src) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/mask_test.go000066400000000000000000000002351452252572700247270ustar00rootroot00000000000000package mem import "testing" func TestMask(t *testing.T) { testCopy(t, Mask, maskGeneric) } func BenchmarkMask(b *testing.B) { benchmarkCopy(b, Mask) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/mem.go000066400000000000000000000000651452252572700235140ustar00rootroot00000000000000package mem import _ "github.com/segmentio/asm/cpu" golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/mem/mem_test.go000066400000000000000000000024371452252572700245600ustar00rootroot00000000000000package mem import ( "bytes" "encoding/hex" "fmt" "io" "math/rand" "testing" ) var ( testSizes = [...]int{ 0, 1, 2, 3, 4, 6, 8, 10, 31, 32, 33, 64, 100, 1024, 4096, } benchmarkSizes = [...]int{ 7, 10, 31, 32, 100, 1024, 4096, } ) func testCopy(t *testing.T, test, init func(dst, src []byte) int) { for _, N := range testSizes { t.Run(fmt.Sprintf("N=%d", N), func(t *testing.T) { src := make([]byte, N) dst := make([]byte, N) exp := make([]byte, N) prng := rand.New(rand.NewSource(0)) io.ReadFull(prng, src) io.ReadFull(prng, dst) copy(exp, dst) init(exp, src) n := test(dst, src) if n != N { t.Errorf("copying did not apply to enough bytes: %d != %d", n, N) } if !bytes.Equal(dst, exp) { t.Error("copying produced the wrong output") t.Logf("expected:\n%s", hex.Dump(exp)) t.Logf("found: \n%s", hex.Dump(dst)) t.Logf("source: \n%s", hex.Dump(src)) } }) } } func benchmarkCopy(b *testing.B, test func(dst, src []byte) int) { for _, N := range benchmarkSizes { b.Run(fmt.Sprintf("N=%d", N), func(b *testing.B) { dst := make([]byte, N) src := make([]byte, N) io.ReadFull(rand.New(rand.NewSource(0)), src) b.SetBytes(int64(N)) b.ResetTimer() for i := 0; i < b.N; i++ { test(dst, src) } }) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/000077500000000000000000000000001452252572700230005ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/generic.go000066400000000000000000000012411452252572700247410ustar00rootroot00000000000000package qsort import "bytes" type generic struct { data []byte size int temp []byte swap func(int, int) } func newGeneric(data []byte, size int, swap func(int, int)) *generic { return &generic{ data: data, size: size, temp: make([]byte, size), swap: swap, } } func (g *generic) Len() int { return len(g.data) / g.size } func (g *generic) Less(i, j int) bool { return bytes.Compare(g.slice(i), g.slice(j)) < 0 } func (g *generic) Swap(i, j int) { copy(g.temp, g.slice(j)) copy(g.slice(j), g.slice(i)) copy(g.slice(i), g.temp) if g.swap != nil { g.swap(i, j) } } func (g *generic) slice(i int) []byte { return g.data[i*g.size : (i+1)*g.size] } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort.go000066400000000000000000000070521452252572700243220ustar00rootroot00000000000000package qsort import ( "sort" "github.com/segmentio/asm/bswap" "github.com/segmentio/asm/cpu" "github.com/segmentio/asm/cpu/x86" "github.com/segmentio/asm/internal" ) // Sort sorts contiguous big-endian chunks of bytes of a fixed size. // Sorting specializations are available for sizes of 8, 16, 24 and 32 bytes. func Sort(data []byte, size int, swap func(int, int)) { if len(data) <= size { return } if size <= 0 || !internal.MultipleOf(size, len(data)) { panic("input length is not a multiple of element size") } // No specialization available. 
Use the slower generic sorting routine. if size%8 != 0 || size > 32 { sort.Sort(newGeneric(data, size, swap)) return } // Byte swap each qword prior to sorting. Doing a single pass here, and // again after the sort, is faster than byte swapping during each // comparison. The sorting routines have been written to assume that high // qwords come before low qwords, and so we're able to use the same // Swap64() routine rather than needing separate byte swapping routines // for 8, 16, 24, or 32 bytes. bswap.Swap64(data) defer bswap.Swap64(data) // If no indirect swapping is required, try to use the hybrid partitioning scheme from // https://blog.reverberate.org/2020/05/29/hoares-rebuttal-bubble-sorts-comeback.html switch { case swap == nil && !purego && size == 8 && cpu.X86.Has(x86.CMOV): hybridQuicksort64(unsafeBytesTo64(data)) case swap == nil && !purego && size == 16 && cpu.X86.Has(x86.AVX): hybridQuicksort128(unsafeBytesTo128(data)) case swap == nil && !purego && size == 32 && cpu.X86.Has(x86.AVX2): hybridQuicksort256(unsafeBytesTo256(data)) case size == 8: quicksort64(unsafeBytesTo64(data), 0, smallCutoff, insertionsort64, hoarePartition64, swap) case size == 16: quicksort128(unsafeBytesTo128(data), 0, smallCutoff, insertionsort128, hoarePartition128, swap) case size == 24: quicksort192(unsafeBytesTo192(data), 0, smallCutoff, insertionsort192, hoarePartition192, swap) case size == 32: quicksort256(unsafeBytesTo256(data), 0, smallCutoff, insertionsort256, hoarePartition256, swap) } } func hybridQuicksort64(data []uint64) { // The hybrid Lomuto/Hoare partition scheme requires scratch space. We // allocate some stack space for the task here in this trampoline function, // so that we don't pay the stack cost unless necessary. var buf [scratchSize]byte scratch := unsafeBytesTo64(buf[:]) partition := func(data []uint64, base int, swap func(int, int)) int { return hybridPartition64(data, scratch) } quicksort64(data, 0, smallCutoff/2, bubblesort64NoSwap2, partition, nil) } func hybridQuicksort128(data []uint128) { var buf [scratchSize]byte scratch := unsafeBytesTo128(buf[:]) partition := func(data []uint128, base int, swap func(int, int)) int { return hybridPartition128(data, scratch) } quicksort128(data, 0, smallCutoff*2, insertionsort128NoSwap, partition, nil) } func hybridQuicksort256(data []uint256) { var buf [scratchSize]byte scratch := unsafeBytesTo256(buf[:]) partition := func(data []uint256, base int, swap func(int, int)) int { return hybridPartition256(data, scratch) } quicksort256(data, 0, smallCutoff*2, insertionsort256NoSwap, partition, nil) } // The threshold at which log-linear sorting methods switch to // a quadratic (but cache-friendly) method such as insertionsort. const smallCutoff = 256 // The amount of stack space to allocate as scratch space when // using the hybrid Lomuto/Hoare partition scheme. const scratchSize = 1024 func callswap(base int, swap func(int, int), i, j int) { if swap != nil { swap(base+i, base+j) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort16.go000066400000000000000000000056561452252572700245010ustar00rootroot00000000000000package qsort // A type alias is used here so that we can pass values to an avo // generated assembly routine. At present, we can't figure out how // to use the uint128 type in function signatures there. A type // alias allows us to use struct { a, b uint64 } in the signature, // and implicit conversions only work if you're using a type alias. 
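//
// A sketch of the difference (hypothetical names): with an alias, a value of
// the identical unnamed struct type converts implicitly, which is what the
// generated stubs rely on.
//
//	type pair = struct{ hi, lo uint64 } // alias: a struct{ hi, lo uint64 } value assigns directly
//	type named struct{ hi, lo uint64 }  // defined type: would need an explicit conversion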
type uint128 = struct { hi uint64 lo uint64 } type smallsort128 func(data []uint128, base int, swap func(int, int)) type partition128 func(data []uint128, base int, swap func(int, int)) int func quicksort128(data []uint128, base, cutoff int, smallsort smallsort128, partition partition128, swap func(int, int)) { for len(data) > 1 { if len(data) <= cutoff/16 { smallsort(data, base, swap) return } medianOfThree128(data, base, swap) p := partition(data, base, swap) if p < len(data)-p { // recurse on the smaller side quicksort128(data[:p], base, cutoff, smallsort, partition, swap) data = data[p+1:] base = base + p + 1 } else { quicksort128(data[p+1:], base+p+1, cutoff, smallsort, partition, swap) data = data[:p] } } } func insertionsort128(data []uint128, base int, swap func(int, int)) { for i := 1; i < len(data); i++ { item := data[i] for j := i; j > 0 && less128(item, data[j-1]); j-- { data[j], data[j-1] = data[j-1], data[j] callswap(base, swap, j, j-1) } } } func medianOfThree128(data []uint128, base int, swap func(int, int)) { end := len(data) - 1 mid := len(data) / 2 if less128(data[0], data[mid]) { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } if less128(data[end], data[0]) { data[0], data[end] = data[end], data[0] callswap(base, swap, 0, end) if less128(data[0], data[mid]) { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } } } func hoarePartition128(data []uint128, base int, swap func(int, int)) int { i, j := 1, len(data)-1 if len(data) > 0 { pivot := data[0] for j < len(data) { for i < len(data) && less128(data[i], pivot) { i++ } for j > 0 && less128(pivot, data[j]) { j-- } if i >= j { break } data[i], data[j] = data[j], data[i] callswap(base, swap, i, j) i++ j-- } data[0], data[j] = data[j], data[0] callswap(base, swap, 0, j) } return j } func hybridPartition128(data, scratch []uint128) int { pivot, lo, hi, limit := 0, 1, len(data)-1, len(scratch) p := distributeForward128(data, scratch, limit, lo, hi) if hi-p <= limit { scratch = scratch[limit-hi+p:] } else { lo = p + limit for { hi = distributeBackward128(data, data[lo+1-limit:], limit, lo, hi) - limit if hi < lo { p = hi break } lo = distributeForward128(data, data[hi+1:], limit, lo, hi) + limit if hi < lo { p = lo - limit break } } } copy(data[p+1:], scratch[:]) data[pivot], data[p] = data[p], data[pivot] return p } func less128(a, b uint128) bool { return a.hi < b.hi || (a.hi == b.hi && a.lo < b.lo) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort24.go000066400000000000000000000041261452252572700244670ustar00rootroot00000000000000package qsort type uint192 = struct { hi uint64 mid uint64 lo uint64 } type smallsort192 func(data []uint192, base int, swap func(int, int)) type partition192 func(data []uint192, base int, swap func(int, int)) int func quicksort192(data []uint192, base, cutoff int, smallsort smallsort192, partition partition192, swap func(int, int)) { for len(data) > 1 { if len(data) <= cutoff/24 { smallsort(data, base, swap) return } medianOfThree192(data, base, swap) p := partition(data, base, swap) if p < len(data)-p { // recurse on the smaller side quicksort192(data[:p], base, cutoff, smallsort, partition, swap) data = data[p+1:] base = base + p + 1 } else { quicksort192(data[p+1:], base+p+1, cutoff, smallsort, partition, swap) data = data[:p] } } } func insertionsort192(data []uint192, base int, swap func(int, int)) { for i := 1; i < len(data); i++ { item := data[i] for j := i; j > 0 && less192(item, data[j-1]); j-- { data[j], data[j-1] = data[j-1], 
data[j] callswap(base, swap, j, j-1) } } } func medianOfThree192(data []uint192, base int, swap func(int, int)) { end := len(data) - 1 mid := len(data) / 2 if less192(data[0], data[mid]) { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } if less192(data[end], data[0]) { data[0], data[end] = data[end], data[0] callswap(base, swap, 0, end) if less192(data[0], data[mid]) { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } } } func hoarePartition192(data []uint192, base int, swap func(int, int)) int { i, j := 1, len(data)-1 if len(data) > 0 { pivot := data[0] for j < len(data) { for i < len(data) && less192(data[i], pivot) { i++ } for j > 0 && less192(pivot, data[j]) { j-- } if i >= j { break } data[i], data[j] = data[j], data[i] callswap(base, swap, i, j) i++ j-- } data[0], data[j] = data[j], data[0] callswap(base, swap, 0, j) } return j } func less192(a, b uint192) bool { return a.hi < b.hi || (a.hi == b.hi && a.mid < b.mid) || (a.hi == b.hi && a.mid == b.mid && a.lo < b.lo) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort32.go000066400000000000000000000053441452252572700244710ustar00rootroot00000000000000package qsort type uint256 = struct { a uint64 // hi b uint64 c uint64 d uint64 // lo } type smallsort256 func(data []uint256, base int, swap func(int, int)) type partition256 func(data []uint256, base int, swap func(int, int)) int func quicksort256(data []uint256, base, cutoff int, smallsort smallsort256, partition partition256, swap func(int, int)) { for len(data) > 1 { if len(data) <= cutoff/32 { smallsort(data, base, swap) return } medianOfThree256(data, base, swap) p := partition(data, base, swap) if p < len(data)-p { // recurse on the smaller side quicksort256(data[:p], base, cutoff, smallsort, partition, swap) data = data[p+1:] base = base + p + 1 } else { quicksort256(data[p+1:], base+p+1, cutoff, smallsort, partition, swap) data = data[:p] } } } func insertionsort256(data []uint256, base int, swap func(int, int)) { for i := 1; i < len(data); i++ { item := data[i] for j := i; j > 0 && less256(item, data[j-1]); j-- { data[j], data[j-1] = data[j-1], data[j] callswap(base, swap, j, j-1) } } } func medianOfThree256(data []uint256, base int, swap func(int, int)) { end := len(data) - 1 mid := len(data) / 2 if less256(data[0], data[mid]) { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } if less256(data[end], data[0]) { data[0], data[end] = data[end], data[0] callswap(base, swap, 0, end) if less256(data[0], data[mid]) { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } } } func hoarePartition256(data []uint256, base int, swap func(int, int)) int { i, j := 1, len(data)-1 if len(data) > 0 { pivot := data[0] for j < len(data) { for i < len(data) && less256(data[i], pivot) { i++ } for j > 0 && less256(pivot, data[j]) { j-- } if i >= j { break } data[i], data[j] = data[j], data[i] callswap(base, swap, i, j) i++ j-- } data[0], data[j] = data[j], data[0] callswap(base, swap, 0, j) } return j } func hybridPartition256(data, scratch []uint256) int { pivot, lo, hi, limit := 0, 1, len(data)-1, len(scratch) p := distributeForward256(data, scratch, limit, lo, hi) if hi-p <= limit { scratch = scratch[limit-hi+p:] } else { lo = p + limit for { hi = distributeBackward256(data, data[lo+1-limit:], limit, lo, hi) - limit if hi < lo { p = hi break } lo = distributeForward256(data, data[hi+1:], limit, lo, hi) + limit if hi < lo { p = lo - limit break } } } copy(data[p+1:], scratch[:]) data[pivot], data[p] = 
data[p], data[pivot] return p } func less256(a, b uint256) bool { return a.a < b.a || (a.a == b.a && a.b < b.b) || (a.a == b.a && a.b == b.b && a.c < b.c) || (a.a == b.a && a.b == b.b && a.c == b.c && a.d < b.d) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort8.go000066400000000000000000000062671452252572700244210ustar00rootroot00000000000000package qsort type smallsort64 func(data []uint64, base int, swap func(int, int)) type partition64 func(data []uint64, base int, swap func(int, int)) int func quicksort64(data []uint64, base, cutoff int, smallsort smallsort64, partition partition64, swap func(int, int)) { for len(data) > 1 { if len(data) <= cutoff/8 { smallsort(data, base, swap) return } medianOfThree64(data, base, swap) p := partition(data, base, swap) if p < len(data)-p { // recurse on the smaller side quicksort64(data[:p], base, cutoff, smallsort, partition, swap) data = data[p+1:] base = base + p + 1 } else { quicksort64(data[p+1:], base+p+1, cutoff, smallsort, partition, swap) data = data[:p] } } } func bubblesort64NoSwap1(data []uint64, base int, swap func(int, int)) { for i := len(data); i > 1; i-- { max := data[0] for j := 1; j < i; j++ { y := data[j] x := uint64(0) if max <= y { x = max } else { x = y } if max <= y { max = y } data[j-1] = x } data[i-1] = max } } func bubblesort64NoSwap2(data []uint64, base int, swap func(int, int)) { for i := len(data); i > 1; i -= 2 { x := data[0] y := data[1] if y < x { x, y = y, x } for j := 2; j < i; j++ { z := data[j] w := uint64(0) v := uint64(0) if y <= z { w = y } else { w = z } if y <= z { y = z } if x <= z { v = x } else { v = z } if x <= z { x = w } data[j-2] = v } data[i-2] = x data[i-1] = y } } func insertionsort64(data []uint64, base int, swap func(int, int)) { for i := 1; i < len(data); i++ { item := data[i] for j := i; j > 0 && item < data[j-1]; j-- { data[j], data[j-1] = data[j-1], data[j] callswap(base, swap, j, j-1) } } } func medianOfThree64(data []uint64, base int, swap func(int, int)) { end := len(data) - 1 mid := len(data) / 2 if data[0] < data[mid] { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } if data[end] < data[0] { data[0], data[end] = data[end], data[0] callswap(base, swap, 0, end) if data[0] < data[mid] { data[mid], data[0] = data[0], data[mid] callswap(base, swap, mid, 0) } } } func hoarePartition64(data []uint64, base int, swap func(int, int)) int { i, j := 1, len(data)-1 if len(data) > 0 { pivot := data[0] for j < len(data) { for i < len(data) && data[i] < pivot { i++ } for j > 0 && pivot < data[j] { j-- } if i >= j { break } data[i], data[j] = data[j], data[i] callswap(base, swap, i, j) i++ j-- } data[0], data[j] = data[j], data[0] callswap(base, swap, 0, j) } return j } func hybridPartition64(data, scratch []uint64) int { pivot, lo, hi, limit := 0, 1, len(data)-1, len(scratch) p := distributeForward64(data, scratch, limit, lo, hi) if hi-p <= limit { scratch = scratch[limit-hi+p:] } else { lo = p + limit for { hi = distributeBackward64(data, data[lo+1-limit:], limit, lo, hi) - limit if hi < lo { p = hi break } lo = distributeForward64(data, data[hi+1:], limit, lo, hi) + limit if hi < lo { p = lo - limit break } } } copy(data[p+1:], scratch[:]) data[pivot], data[p] = data[p], data[pivot] return p } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort_amd64.go000066400000000000000000000024551452252572700253170ustar00rootroot00000000000000// Code generated by command: go run sort_asm.go -pkg qsort -out ../qsort/sort_amd64.s -stubs ../qsort/sort_amd64.go. 
DO NOT EDIT. //go:build !purego package qsort //go:noescape func distributeForward64(data []uint64, scratch []uint64, limit int, lo int, hi int) int //go:noescape func distributeBackward64(data []uint64, scratch []uint64, limit int, lo int, hi int) int //go:noescape func insertionsort128NoSwap(data []struct { hi uint64 lo uint64 }, base int, swap func(int, int)) //go:noescape func distributeForward128(data []struct { hi uint64 lo uint64 }, scratch []struct { hi uint64 lo uint64 }, limit int, lo int, hi int) int //go:noescape func distributeBackward128(data []struct { hi uint64 lo uint64 }, scratch []struct { hi uint64 lo uint64 }, limit int, lo int, hi int) int //go:noescape func insertionsort256NoSwap(data []struct { a uint64 b uint64 c uint64 d uint64 }, base int, swap func(int, int)) //go:noescape func distributeForward256(data []struct { a uint64 b uint64 c uint64 d uint64 }, scratch []struct { a uint64 b uint64 c uint64 d uint64 }, limit int, lo int, hi int) int //go:noescape func distributeBackward256(data []struct { a uint64 b uint64 c uint64 d uint64 }, scratch []struct { a uint64 b uint64 c uint64 d uint64 }, limit int, lo int, hi int) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort_amd64.s000066400000000000000000000167341452252572700251610ustar00rootroot00000000000000// Code generated by command: go run sort_asm.go -pkg qsort -out ../qsort/sort_amd64.s -stubs ../qsort/sort_amd64.go. DO NOT EDIT. //go:build !purego #include "textflag.h" // func distributeForward64(data []uint64, scratch []uint64, limit int, lo int, hi int) int // Requires: CMOV TEXT ·distributeForward64(SB), NOSPLIT, $0-80 MOVQ data_base+0(FP), AX MOVQ scratch_base+24(FP), CX MOVQ limit+48(FP), DX MOVQ lo+56(FP), BX MOVQ hi+64(FP), SI LEAQ (AX)(BX*8), BX LEAQ (AX)(SI*8), SI LEAQ -8(CX)(DX*8), CX MOVQ (AX), DI XORQ R8, R8 XORQ R9, R9 NEGQ DX loop: MOVQ (BX), R10 CMPQ R10, DI MOVQ BX, R11 CMOVQCC CX, R11 MOVQ R10, (R11)(R8*8) CMC SBBQ R9, R8 ADDQ $0x08, BX CMPQ BX, SI JA done CMPQ R8, DX JNE loop done: SUBQ AX, BX LEAQ (BX)(R8*8), BX SHRQ $0x03, BX DECQ BX MOVQ BX, ret+72(FP) RET // func distributeBackward64(data []uint64, scratch []uint64, limit int, lo int, hi int) int // Requires: CMOV TEXT ·distributeBackward64(SB), NOSPLIT, $0-80 MOVQ data_base+0(FP), AX MOVQ scratch_base+24(FP), CX MOVQ limit+48(FP), DX MOVQ lo+56(FP), BX MOVQ hi+64(FP), SI LEAQ (AX)(BX*8), BX LEAQ (AX)(SI*8), SI MOVQ (AX), DI XORQ R8, R8 XORQ R9, R9 CMPQ SI, BX JBE done loop: MOVQ (SI), R10 CMPQ R10, DI MOVQ CX, R11 CMOVQCC SI, R11 MOVQ R10, (R11)(R8*8) ADCQ R9, R8 SUBQ $0x08, SI CMPQ SI, BX JBE done CMPQ R8, DX JNE loop done: SUBQ AX, SI LEAQ (SI)(R8*8), SI SHRQ $0x03, SI MOVQ SI, ret+72(FP) RET // func insertionsort128NoSwap(data []struct{hi uint64; lo uint64}, base int, swap func(int, int)) // Requires: AVX TEXT ·insertionsort128NoSwap(SB), NOSPLIT, $0-40 MOVQ data_base+0(FP), AX MOVQ data_len+8(FP), CX SHLQ $0x04, CX ADDQ AX, CX TESTQ AX, CX JE done VPCMPEQB X0, X0, X0 VPSLLQ $0x3f, X0, X0 MOVQ AX, DX outer: ADDQ $0x10, DX CMPQ DX, CX JAE done VMOVDQU (DX), X1 MOVQ DX, BX inner: VMOVDQU -16(BX), X2 VPCMPEQQ X1, X2, X3 VPADDQ X1, X0, X4 VPADDQ X2, X0, X5 VPCMPGTQ X4, X5, X4 VMOVMSKPD X3, SI VMOVMSKPD X4, DI NOTL SI BSFL SI, R8 BTSL R8, DI JAE outer VMOVDQU X2, (BX) VMOVDQU X1, -16(BX) SUBQ $0x10, BX CMPQ BX, AX JA inner JMP outer done: RET // func distributeForward128(data []struct{hi uint64; lo uint64}, scratch []struct{hi uint64; lo uint64}, limit int, lo int, hi int) int // Requires: AVX, CMOV TEXT 
·distributeForward128(SB), NOSPLIT, $0-80 MOVQ data_base+0(FP), AX MOVQ scratch_base+24(FP), CX MOVQ limit+48(FP), DX MOVQ lo+56(FP), BX MOVQ hi+64(FP), SI SHLQ $0x04, DX SHLQ $0x04, BX SHLQ $0x04, SI LEAQ (AX)(BX*1), BX LEAQ (AX)(SI*1), SI LEAQ -16(CX)(DX*1), CX VPCMPEQB X0, X0, X0 VPSLLQ $0x3f, X0, X0 VMOVDQU (AX), X1 XORQ R8, R8 XORQ R9, R9 NEGQ DX loop: VMOVDQU (BX), X2 VPCMPEQQ X2, X1, X3 VPADDQ X2, X0, X4 VPADDQ X1, X0, X5 VPCMPGTQ X4, X5, X4 VMOVMSKPD X3, R10 VMOVMSKPD X4, R11 NOTL R10 BSFL R10, DI BTSL DI, R11 MOVQ BX, R10 CMOVQCC CX, R10 VMOVDQU X2, (R10)(R8*1) SETCC R9 SHLQ $0x04, R9 SUBQ R9, R8 ADDQ $0x10, BX CMPQ BX, SI JA done CMPQ R8, DX JNE loop done: SUBQ AX, BX LEAQ (BX)(R8*1), BX SHRQ $0x04, BX DECQ BX MOVQ BX, ret+72(FP) RET // func distributeBackward128(data []struct{hi uint64; lo uint64}, scratch []struct{hi uint64; lo uint64}, limit int, lo int, hi int) int // Requires: AVX, CMOV TEXT ·distributeBackward128(SB), NOSPLIT, $0-80 MOVQ data_base+0(FP), AX MOVQ scratch_base+24(FP), CX MOVQ limit+48(FP), DX MOVQ lo+56(FP), BX MOVQ hi+64(FP), SI SHLQ $0x04, DX SHLQ $0x04, BX SHLQ $0x04, SI LEAQ (AX)(BX*1), BX LEAQ (AX)(SI*1), SI VPCMPEQB X0, X0, X0 VPSLLQ $0x3f, X0, X0 VMOVDQU (AX), X1 XORQ R8, R8 XORQ R9, R9 CMPQ SI, BX JBE done loop: VMOVDQU (SI), X2 VPCMPEQQ X2, X1, X3 VPADDQ X2, X0, X4 VPADDQ X1, X0, X5 VPCMPGTQ X4, X5, X4 VMOVMSKPD X3, R10 VMOVMSKPD X4, R11 NOTL R10 BSFL R10, DI BTSL DI, R11 MOVQ CX, R10 CMOVQCC SI, R10 VMOVDQU X2, (R10)(R8*1) SETCS R9 SHLQ $0x04, R9 ADDQ R9, R8 SUBQ $0x10, SI CMPQ SI, BX JBE done CMPQ R8, DX JNE loop done: SUBQ AX, SI LEAQ (SI)(R8*1), SI SHRQ $0x04, SI MOVQ SI, ret+72(FP) RET // func insertionsort256NoSwap(data []struct{a uint64; b uint64; c uint64; d uint64}, base int, swap func(int, int)) // Requires: AVX, AVX2 TEXT ·insertionsort256NoSwap(SB), NOSPLIT, $0-40 MOVQ data_base+0(FP), AX MOVQ data_len+8(FP), CX SHLQ $0x05, CX ADDQ AX, CX TESTQ AX, CX JE done VPCMPEQB Y0, Y0, Y0 VPSLLQ $0x3f, Y0, Y0 MOVQ AX, DX outer: ADDQ $0x20, DX CMPQ DX, CX JAE done VMOVDQU (DX), Y1 MOVQ DX, BX inner: VMOVDQU -32(BX), Y2 VPCMPEQQ Y1, Y2, Y3 VPADDQ Y1, Y0, Y4 VPADDQ Y2, Y0, Y5 VPCMPGTQ Y4, Y5, Y4 VMOVMSKPD Y3, SI VMOVMSKPD Y4, DI NOTL SI BSFL SI, R8 BTSL R8, DI JAE outer VMOVDQU Y2, (BX) VMOVDQU Y1, -32(BX) SUBQ $0x20, BX CMPQ BX, AX JA inner JMP outer done: VZEROUPPER RET // func distributeForward256(data []struct{a uint64; b uint64; c uint64; d uint64}, scratch []struct{a uint64; b uint64; c uint64; d uint64}, limit int, lo int, hi int) int // Requires: AVX, AVX2, CMOV TEXT ·distributeForward256(SB), NOSPLIT, $0-80 MOVQ data_base+0(FP), AX MOVQ scratch_base+24(FP), CX MOVQ limit+48(FP), DX MOVQ lo+56(FP), BX MOVQ hi+64(FP), SI SHLQ $0x05, DX SHLQ $0x05, BX SHLQ $0x05, SI LEAQ (AX)(BX*1), BX LEAQ (AX)(SI*1), SI LEAQ -32(CX)(DX*1), CX VPCMPEQB Y0, Y0, Y0 VPSLLQ $0x3f, Y0, Y0 VMOVDQU (AX), Y1 XORQ R8, R8 XORQ R9, R9 NEGQ DX loop: VMOVDQU (BX), Y2 VPCMPEQQ Y2, Y1, Y3 VPADDQ Y2, Y0, Y4 VPADDQ Y1, Y0, Y5 VPCMPGTQ Y4, Y5, Y4 VMOVMSKPD Y3, R10 VMOVMSKPD Y4, R11 NOTL R10 BSFL R10, DI BTSL DI, R11 MOVQ BX, R10 CMOVQCC CX, R10 VMOVDQU Y2, (R10)(R8*1) SETCC R9 SHLQ $0x05, R9 SUBQ R9, R8 ADDQ $0x20, BX CMPQ BX, SI JA done CMPQ R8, DX JNE loop done: SUBQ AX, BX LEAQ (BX)(R8*1), BX SHRQ $0x05, BX DECQ BX MOVQ BX, ret+72(FP) VZEROUPPER RET // func distributeBackward256(data []struct{a uint64; b uint64; c uint64; d uint64}, scratch []struct{a uint64; b uint64; c uint64; d uint64}, limit int, lo int, hi int) int // Requires: AVX, AVX2, CMOV TEXT 
·distributeBackward256(SB), NOSPLIT, $0-80 MOVQ data_base+0(FP), AX MOVQ scratch_base+24(FP), CX MOVQ limit+48(FP), DX MOVQ lo+56(FP), BX MOVQ hi+64(FP), SI SHLQ $0x05, DX SHLQ $0x05, BX SHLQ $0x05, SI LEAQ (AX)(BX*1), BX LEAQ (AX)(SI*1), SI VPCMPEQB Y0, Y0, Y0 VPSLLQ $0x3f, Y0, Y0 VMOVDQU (AX), Y1 XORQ R8, R8 XORQ R9, R9 CMPQ SI, BX JBE done loop: VMOVDQU (SI), Y2 VPCMPEQQ Y2, Y1, Y3 VPADDQ Y2, Y0, Y4 VPADDQ Y1, Y0, Y5 VPCMPGTQ Y4, Y5, Y4 VMOVMSKPD Y3, R10 VMOVMSKPD Y4, R11 NOTL R10 BSFL R10, DI BTSL DI, R11 MOVQ CX, R10 CMOVQCC SI, R10 VMOVDQU Y2, (R10)(R8*1) SETCS R9 SHLQ $0x05, R9 ADDQ R9, R8 SUBQ $0x20, SI CMPQ SI, BX JBE done CMPQ R8, DX JNE loop done: SUBQ AX, SI LEAQ (SI)(R8*1), SI SHRQ $0x05, SI MOVQ SI, ret+72(FP) VZEROUPPER RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort_asm.go000066400000000000000000000001311452252572700251510ustar00rootroot00000000000000//go:build !purego && amd64 // +build !purego,amd64 package qsort const purego = false golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort_default.go000066400000000000000000000017031452252572700260230ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package qsort const purego = true func distributeForward64(data []uint64, scratch []uint64, limit int, lo int, hi int) int { panic("not implemented") } func distributeBackward64(data []uint64, scratch []uint64, limit int, lo int, hi int) int { panic("not implemented") } func insertionsort128NoSwap(data []uint128, base int, swap func(int, int)) { panic("not implemented") } func distributeForward128(data, scratch []uint128, limit, lo, hi int) int { panic("not implemented") } func distributeBackward128(data, scratch []uint128, limit, lo, hi int) int { panic("not implemented") } func insertionsort256NoSwap(data []uint256, base int, swap func(int, int)) { panic("not implemented") } func distributeForward256(data, scratch []uint256, limit, lo, hi int) int { panic("not implemented") } func distributeBackward256(data, scratch []uint256, limit, lo, hi int) int { panic("not implemented") } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/sort_test.go000066400000000000000000000204421452252572700253570ustar00rootroot00000000000000package qsort import ( "bytes" "math/rand" "reflect" "sort" "strconv" "testing" ) var prng = rand.New(rand.NewSource(0)) // Note, "8", "16", "32" etc are all byte measurements, not bits. So a 32 byte // integer, for example, which you might see in e.g. a SHA256 hash. func TestSort8(t *testing.T) { testSort(t, 8) } func TestSort16(t *testing.T) { testSort(t, 16) } func TestSort24(t *testing.T) { testSort(t, 24) } func TestSort32(t *testing.T) { testSort(t, 32) } func testSort(t *testing.T, size int) { const ( iterations = 1000 minCount = 0 maxCount = 1000 ) buf := make([]byte, maxCount*size) // A first test to validate that the swap function is called properly: prng.Read(buf) values := make([]byte, len(buf)) copy(values, buf) tmp := make([]byte, size) Sort(buf, size, func(i, j int) { vi := values[i*size : (i+1)*size] vj := values[j*size : (j+1)*size] copy(tmp, vi) copy(vi, vj) copy(vj, tmp) }) if !bytes.Equal(buf, values) { t.Fatal("values were not sorted correctly by the swap function") } for i := 0; i < iterations; i++ { count := randint(minCount, maxCount) slice := buf[:count*size] prng.Read(slice) // Test with/without duplicates. 
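// A prefix of random length is replicated at fixed strides below, which makes repeated byte patterns (and therefore duplicate elements) likely when repeat > 0, while repeat == 0 leaves the input fully random.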
repeat := randint(0, count) for j := repeat; repeat > 0 && j < len(slice) && j+repeat < len(slice); j += repeat { copy(slice[j:j+repeat], slice[:repeat]) } expect := values[:len(slice)] copy(expect, slice) sort.Sort(newGeneric(expect, size, nil)) if !sort.IsSorted(newGeneric(expect, size, nil)) { t.Fatal("reference implementation did not produce a sorted output") } Sort(slice, size, nil) if !reflect.DeepEqual(expect, slice) { t.Fatal("buffer was not sorted correctly") } } } func TestPivot8(t *testing.T) { lo := uint64(1) mid := uint64(2) hi := uint64(3) for i := 0; i < 1000; i++ { input := []uint64{lo, mid, hi} rand.Shuffle(3, func(i, j int) { input[i], input[j] = input[j], input[i] }) medianOfThree64(input, 3, nil) if input[0] != mid { t.Fatal("medianOfThree64 did not put pivot in first position") } } } func TestPivot16(t *testing.T) { lo := uint128{lo: 1} mid := uint128{lo: 2} hi := uint128{lo: 3} for i := 0; i < 1000; i++ { input := []uint128{lo, mid, hi} rand.Shuffle(3, func(i, j int) { input[i], input[j] = input[j], input[i] }) medianOfThree128(input, 3, nil) if input[0] != mid { t.Fatal("medianOfThree128 did not put pivot in first position") } } } func TestPivot24(t *testing.T) { lo := uint192{lo: 1} mid := uint192{lo: 2} hi := uint192{lo: 3} for i := 0; i < 1000; i++ { input := []uint192{lo, mid, hi} rand.Shuffle(3, func(i, j int) { input[i], input[j] = input[j], input[i] }) medianOfThree192(input, 3, nil) if input[0] != mid { t.Fatal("medianOfThree192 did not put pivot in first position") } } } func TestPivot32(t *testing.T) { lo := uint256{d: 1} mid := uint256{d: 2} hi := uint256{d: 3} for i := 0; i < 1000; i++ { input := []uint256{lo, mid, hi} rand.Shuffle(3, func(i, j int) { input[i], input[j] = input[j], input[i] }) medianOfThree256(input, 3, nil) if input[0] != mid { t.Fatal("medianOfThree256 did not put pivot in first position") } } } func randint(lo, hi int) int { if hi == lo { return lo } return prng.Intn(hi-lo) + lo } func BenchmarkSort8(b *testing.B) { for _, count := range []int{1e3, 1e4, 1e5, 1e6} { b.Run("random-"+strconv.Itoa(count), benchSort(count, 8, 0, random, nil)) if count > 1e4 { b.Run("partially-ordered(10)-"+strconv.Itoa(count), benchSort(count, 8, 10, partiallyOrdered, nil)) b.Run("partially-ordered(100)-"+strconv.Itoa(count), benchSort(count, 8, 100, partiallyOrdered, nil)) b.Run("partially-ordered(1000)-"+strconv.Itoa(count), benchSort(count, 8, 1000, partiallyOrdered, nil)) } } } func stdlibSort8(b *testing.B, size int) { // 8 bytes per int64 b.SetBytes(8 * int64(size)) data := make([]int64, size) unsorted := make([]int64, size) for j := 0; j < len(unsorted); j++ { unsorted[j] = int64(rand.Intn(size / 10)) } b.StopTimer() for i := 0; i < b.N; i++ { copy(data, unsorted) b.StartTimer() sort.Slice(data, func(i, j int) bool { return data[i] < data[j] }) b.StopTimer() } } func stdlibSort8PartiallySorted(b *testing.B, size int, partitions int) { // 8 bytes per int64 b.SetBytes(8 * int64(size)) data := make([]int64, size) // size is assumed to be a multiple of partitions; integer division drops any remainder partitionSize := int(size / partitions) partitionOrder := rand.Perm(partitions) groupedPartitions := make([][]int64, partitions) for i := 0; i < len(groupedPartitions); i++ { partition := make([]int64, partitionSize) for j := 0; j < len(partition); j++ { partition[j] = int64(rand.Intn(size / 10)) } sort.Slice(partition, func(i, j int) bool { return partition[i] < partition[j] }) groupedPartitions[partitionOrder[i]] = partition } partiallyOrdered := make([]int64, 0, size) for _, partition := range groupedPartitions { partiallyOrdered =
append(partiallyOrdered, partition...) } b.StopTimer() for i := 0; i < b.N; i++ { copy(data, partiallyOrdered) b.StartTimer() sort.Slice(data, func(i, j int) bool { return data[i] < data[j] }) b.StopTimer() } } func BenchmarkStdlibSort8(b *testing.B) { for _, size := range []int{1e5, 1e6} { b.Run("random-"+strconv.Itoa(size), func(b *testing.B) { stdlibSort8(b, size) }) b.Run("partially-sorted(10)-"+strconv.Itoa(size), func(b *testing.B) { stdlibSort8PartiallySorted(b, size, 10) }) b.Run("partially-sorted(100)-"+strconv.Itoa(size), func(b *testing.B) { stdlibSort8PartiallySorted(b, size, 100) }) b.Run("partially-sorted(1000)-"+strconv.Itoa(size), func(b *testing.B) { stdlibSort8PartiallySorted(b, size, 1000) }) } } func BenchmarkSort8Indirect(b *testing.B) { swap := func(int, int) {} const count = 100000 b.Run("random", benchSort(count, 8, 0, random, swap)) b.Run("asc", benchSort(count, 8, 0, asc, swap)) b.Run("desc", benchSort(count, 8, 0, desc, swap)) } func BenchmarkSort16(b *testing.B) { for _, count := range []int{1e3, 1e4, 1e5, 1e6} { b.Run(strconv.Itoa(count), benchSort(count, 16, 0, random, nil)) } } func BenchmarkSort16Indirect(b *testing.B) { swap := func(int, int) {} const count = 100000 b.Run("random", benchSort(count, 16, 0, random, swap)) b.Run("asc", benchSort(count, 16, 0, asc, swap)) b.Run("desc", benchSort(count, 16, 0, desc, swap)) } func BenchmarkSort24(b *testing.B) { for _, count := range []int{1e3, 1e4, 1e5, 1e6} { b.Run(strconv.Itoa(count), benchSort(count, 24, 0, random, nil)) } } func BenchmarkSort24Indirect(b *testing.B) { swap := func(int, int) {} const count = 100000 b.Run("random", benchSort(count, 24, 0, random, swap)) b.Run("asc", benchSort(count, 24, 0, asc, swap)) b.Run("desc", benchSort(count, 24, 0, desc, swap)) } func BenchmarkSort32(b *testing.B) { for _, count := range []int{1e3, 1e4, 1e5, 1e6} { b.Run(strconv.Itoa(count), benchSort(count, 32, 0, random, nil)) } } func BenchmarkSort32Indirect(b *testing.B) { swap := func(int, int) {} const count = 100000 b.Run("random", benchSort(count, 32, 0, random, swap)) b.Run("asc", benchSort(count, 32, 0, asc, swap)) b.Run("desc", benchSort(count, 32, 0, desc, swap)) } type order int const ( random order = iota asc desc partiallyOrdered ) func benchSort(count, size, partitions int, order order, indirect func(int, int)) func(*testing.B) { return func(b *testing.B) { b.StopTimer() buf := make([]byte, count*size) unsorted := make([]byte, count*size) prng.Read(unsorted) if order == asc || order == desc { sort.Sort(newGeneric(unsorted, size, nil)) } if order == desc { g := newGeneric(unsorted, size, nil) items := g.Len() for i := 0; i < items/2; i++ { g.Swap(i, items-1-i) } } if order == partiallyOrdered { // count*size is assumed to be a multiple of partitions; integer division drops any remainder partitionSize := int((count * size) / partitions) partitionOrder := rand.Perm(partitions) groupedPartitions := make([][]byte, partitions) for i := 0; i < len(groupedPartitions); i++ { partition := make([]byte, partitionSize) prng.Read(partition) sort.Sort(newGeneric(partition, size, nil)) groupedPartitions[partitionOrder[i]] = partition } unsorted = unsorted[:0] for _, partition := range groupedPartitions { unsorted = append(unsorted, partition...)
} } b.SetBytes(int64(len(buf))) for i := 0; i < b.N; i++ { copy(buf, unsorted) b.StartTimer() Sort(buf, size, indirect) b.StopTimer() } } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/qsort/unsafe.go000066400000000000000000000012201452252572700246030ustar00rootroot00000000000000package qsort import "unsafe" func unsafeBytesTo64(b []byte) []uint64 { return *(*[]uint64)(unsafe.Pointer(cast(b, 8))) } func unsafeBytesTo128(b []byte) []uint128 { return *(*[]uint128)(unsafe.Pointer(cast(b, 16))) } func unsafeBytesTo192(b []byte) []uint192 { return *(*[]uint192)(unsafe.Pointer(cast(b, 24))) } func unsafeBytesTo256(b []byte) []uint256 { return *(*[]uint256)(unsafe.Pointer(cast(b, 32))) } func cast(b []byte, size int) *sliceHeader { return &sliceHeader{ Data: *(*unsafe.Pointer)(unsafe.Pointer(&b)), Len: len(b) / size, Cap: len(b) / size, } } type sliceHeader struct { Data unsafe.Pointer Len int Cap int } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/slices/000077500000000000000000000000001452252572700231125ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/slices/sums.go000066400000000000000000000036501452252572700244340ustar00rootroot00000000000000package slices import _ "github.com/segmentio/asm/cpu" // SumUint64 sums pairs of elements by index from x and y, similar to Python's zip routine. // If available, AVX instructions will be used to operate on many uint64s simultaneously. // // Results are returned in the x slice and y is left unaltered. If x and y differ in size, // only min(len(x), len(y)) elements will be processed. func SumUint64(x []uint64, y []uint64) { sumUint64(x, y) } func sumUint64Generic(x, y []uint64) { for i := 0; i < len(x) && i < len(y); i++ { x[i] = x[i] + y[i] } } // SumUint32 sums pairs of elements by index from x and y, similar to Python's zip routine. // If available, AVX instructions will be used to operate on many uint32s simultaneously. // // Results are returned in the x slice and y is left unaltered. If x and y differ in size, // only min(len(x), len(y)) elements will be processed. func SumUint32(x []uint32, y []uint32) { sumUint32(x, y) } func sumUint32Generic(x, y []uint32) { for i := 0; i < len(x) && i < len(y); i++ { x[i] = x[i] + y[i] } } // SumUint16 sums pairs of elements by index from x and y, similar to Python's zip routine. // If available, AVX instructions will be used to operate on many uint16s simultaneously. // // Results are returned in the x slice and y is left unaltered. If x and y differ in size, // only min(len(x), len(y)) elements will be processed. func SumUint16(x []uint16, y []uint16) { sumUint16(x, y) } func sumUint16Generic(x, y []uint16) { for i := 0; i < len(x) && i < len(y); i++ { x[i] = x[i] + y[i] } } // SumUint8 sums pairs of elements by index from x and y, similar to Python's zip routine. // If available, AVX instructions will be used to operate on many uint8s simultaneously. // // Results are returned in the x slice and y is left unaltered. If x and y differ in size, // only min(len(x), len(y)) elements will be processed. func SumUint8(x, y []uint8) { sumUint8(x, y) } func sumUint8Generic(x, y []uint8) { for i := 0; i < len(x) && i < len(y); i++ { x[i] = x[i] + y[i] } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/slices/sums_amd64.go000066400000000000000000000010661452252572700254260ustar00rootroot00000000000000// Code generated by command: go run sums_asm.go -pkg slices -out ../slices/sums_amd64.s -stubs ../slices/sums_amd64.go. DO NOT EDIT.
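// The stub declarations below are implemented by the assembly routines in sums_amd64.s; under the purego build tag, or on non-amd64 targets, the fallbacks in sums_default.go dispatch to the portable sumUint*Generic implementations instead.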
//go:build !purego package slices // Sum uint64s using avx2 instructions, results stored in x func sumUint64(x []uint64, y []uint64) // Sum uint32s using avx2 instructions, results stored in x func sumUint32(x []uint32, y []uint32) // Sum uint16s using avx2 instructions, results stored in x func sumUint16(x []uint16, y []uint16) // Sum uint8s using avx2 instructions, results stored in x func sumUint8(x []uint8, y []uint8) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/slices/sums_amd64.s000066400000000000000000000077151452252572700252720ustar00rootroot00000000000000// Code generated by command: go run sums_asm.go -pkg slices -out ../slices/sums_amd64.s -stubs ../slices/sums_amd64.go. DO NOT EDIT. //go:build !purego #include "textflag.h" // func sumUint64(x []uint64, y []uint64) // Requires: AVX, AVX2, CMOV TEXT ·sumUint64(SB), NOSPLIT, $0-48 XORQ CX, CX MOVQ x_base+0(FP), DX MOVQ y_base+24(FP), BX MOVQ x_len+8(FP), SI MOVQ y_len+32(FP), AX CMPQ AX, SI CMOVQLT AX, SI BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC x86_loop avx2_loop: MOVQ CX, AX ADDQ $0x10, AX CMPQ AX, SI JAE x86_loop VMOVDQU (DX)(CX*8), Y0 VMOVDQU (BX)(CX*8), Y1 VMOVDQU 32(DX)(CX*8), Y2 VMOVDQU 32(BX)(CX*8), Y3 VMOVDQU 64(DX)(CX*8), Y4 VMOVDQU 64(BX)(CX*8), Y5 VMOVDQU 96(DX)(CX*8), Y6 VMOVDQU 96(BX)(CX*8), Y7 VPADDQ Y0, Y1, Y0 VPADDQ Y2, Y3, Y2 VPADDQ Y4, Y5, Y4 VPADDQ Y6, Y7, Y6 VMOVDQU Y0, (DX)(CX*8) VMOVDQU Y2, 32(DX)(CX*8) VMOVDQU Y4, 64(DX)(CX*8) VMOVDQU Y6, 96(DX)(CX*8) MOVQ AX, CX JMP avx2_loop x86_loop: CMPQ CX, SI JAE return MOVQ (BX)(CX*8), AX ADDQ AX, (DX)(CX*8) ADDQ $0x01, CX JMP x86_loop return: RET // func sumUint32(x []uint32, y []uint32) // Requires: AVX, AVX2, CMOV TEXT ·sumUint32(SB), NOSPLIT, $0-48 XORQ CX, CX MOVQ x_base+0(FP), DX MOVQ y_base+24(FP), BX MOVQ x_len+8(FP), SI MOVQ y_len+32(FP), AX CMPQ AX, SI CMOVQLT AX, SI BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC x86_loop avx2_loop: MOVQ CX, AX ADDQ $0x20, AX CMPQ AX, SI JAE x86_loop VMOVDQU (DX)(CX*4), Y0 VMOVDQU (BX)(CX*4), Y1 VMOVDQU 32(DX)(CX*4), Y2 VMOVDQU 32(BX)(CX*4), Y3 VMOVDQU 64(DX)(CX*4), Y4 VMOVDQU 64(BX)(CX*4), Y5 VMOVDQU 96(DX)(CX*4), Y6 VMOVDQU 96(BX)(CX*4), Y7 VPADDD Y0, Y1, Y0 VPADDD Y2, Y3, Y2 VPADDD Y4, Y5, Y4 VPADDD Y6, Y7, Y6 VMOVDQU Y0, (DX)(CX*4) VMOVDQU Y2, 32(DX)(CX*4) VMOVDQU Y4, 64(DX)(CX*4) VMOVDQU Y6, 96(DX)(CX*4) MOVQ AX, CX JMP avx2_loop x86_loop: CMPQ CX, SI JAE return MOVL (BX)(CX*4), AX ADDL AX, (DX)(CX*4) ADDQ $0x01, CX JMP x86_loop return: RET // func sumUint16(x []uint16, y []uint16) // Requires: AVX, AVX2, CMOV TEXT ·sumUint16(SB), NOSPLIT, $0-48 XORQ CX, CX MOVQ x_base+0(FP), DX MOVQ y_base+24(FP), BX MOVQ x_len+8(FP), SI MOVQ y_len+32(FP), AX CMPQ AX, SI CMOVQLT AX, SI BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC x86_loop avx2_loop: MOVQ CX, AX ADDQ $0x40, AX CMPQ AX, SI JAE x86_loop VMOVDQU (DX)(CX*2), Y0 VMOVDQU (BX)(CX*2), Y1 VMOVDQU 32(DX)(CX*2), Y2 VMOVDQU 32(BX)(CX*2), Y3 VMOVDQU 64(DX)(CX*2), Y4 VMOVDQU 64(BX)(CX*2), Y5 VMOVDQU 96(DX)(CX*2), Y6 VMOVDQU 96(BX)(CX*2), Y7 VPADDW Y0, Y1, Y0 VPADDW Y2, Y3, Y2 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VMOVDQU Y0, (DX)(CX*2) VMOVDQU Y2, 32(DX)(CX*2) VMOVDQU Y4, 64(DX)(CX*2) VMOVDQU Y6, 96(DX)(CX*2) MOVQ AX, CX JMP avx2_loop x86_loop: CMPQ CX, SI JAE return MOVW (BX)(CX*2), AX ADDW AX, (DX)(CX*2) ADDQ $0x01, CX JMP x86_loop return: RET // func sumUint8(x []uint8, y []uint8) // Requires: AVX, AVX2, CMOV TEXT ·sumUint8(SB), NOSPLIT, $0-48 XORQ CX, CX MOVQ x_base+0(FP), DX MOVQ y_base+24(FP), BX MOVQ x_len+8(FP), SI MOVQ y_len+32(FP), AX 
CMPQ AX, SI CMOVQLT AX, SI BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCC x86_loop avx2_loop: MOVQ CX, AX ADDQ $0x80, AX CMPQ AX, SI JAE x86_loop VMOVDQU (DX)(CX*1), Y0 VMOVDQU (BX)(CX*1), Y1 VMOVDQU 32(DX)(CX*1), Y2 VMOVDQU 32(BX)(CX*1), Y3 VMOVDQU 64(DX)(CX*1), Y4 VMOVDQU 64(BX)(CX*1), Y5 VMOVDQU 96(DX)(CX*1), Y6 VMOVDQU 96(BX)(CX*1), Y7 VPADDB Y0, Y1, Y0 VPADDB Y2, Y3, Y2 VPADDB Y4, Y5, Y4 VPADDB Y6, Y7, Y6 VMOVDQU Y0, (DX)(CX*1) VMOVDQU Y2, 32(DX)(CX*1) VMOVDQU Y4, 64(DX)(CX*1) VMOVDQU Y6, 96(DX)(CX*1) MOVQ AX, CX JMP avx2_loop x86_loop: CMPQ CX, SI JAE return MOVB (BX)(CX*1), AL ADDB AL, (DX)(CX*1) ADDQ $0x01, CX JMP x86_loop return: RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/slices/sums_default.go000066400000000000000000000004551452252572700261400ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package slices func sumUint64(x, y []uint64) { sumUint64Generic(x, y) } func sumUint32(x, y []uint32) { sumUint32Generic(x, y) } func sumUint16(x, y []uint16) { sumUint16Generic(x, y) } func sumUint8(x, y []uint8) { sumUint8Generic(x, y) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/slices/sums_test.go000066400000000000000000000142571452252572700255000ustar00rootroot00000000000000package slices import ( "math/rand" "testing" ) var size = 1024 * 1024 func TestSumUint8(t *testing.T) { x, y := generateUint8Slices() genericXCopy := make([]uint8, len(x)) copy(genericXCopy, x) sumUint8(x, y) sumUint8Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint8YLarger(t *testing.T) { x, y := generateUint8Slices() y = append(y, uint8(100)) genericXCopy := make([]uint8, len(x)) copy(genericXCopy, x) sumUint8(x, y) sumUint8Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint8XLarger(t *testing.T) { x, y := generateUint8Slices() x = append(x, uint8(100)) genericXCopy := make([]uint8, len(x)) copy(genericXCopy, x) sumUint8(x, y) sumUint8Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint16(t *testing.T) { x, y := generateUint16Slices() genericXCopy := make([]uint16, len(x)) copy(genericXCopy, x) sumUint16(x, y) sumUint16Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint16YLarger(t *testing.T) { x, y := generateUint16Slices() y = append(y, uint16(100)) genericXCopy := make([]uint16, len(x)) copy(genericXCopy, x) sumUint16(x, y) sumUint16Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint16XLarger(t *testing.T) { x, y := generateUint16Slices() x = append(x, uint16(100)) genericXCopy := make([]uint16, len(x)) copy(genericXCopy, x) sumUint16(x, y) sumUint16Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint32(t *testing.T) { x, y := generateUint32Slices() genericXCopy := make([]uint32, len(x)) copy(genericXCopy, x) 
sumUint32(x, y) sumUint32Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint32YLarger(t *testing.T) { x, y := generateUint32Slices() y = append(y, uint32(100)) genericXCopy := make([]uint32, len(x)) copy(genericXCopy, x) sumUint32(x, y) sumUint32Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint32XLarger(t *testing.T) { x, y := generateUint32Slices() x = append(x, uint32(100)) genericXCopy := make([]uint32, len(x)) copy(genericXCopy, x) sumUint32(x, y) sumUint32Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint64(t *testing.T) { x, y := generateUint64Slices() genericXCopy := make([]uint64, len(x)) copy(genericXCopy, x) sumUint64(x, y) sumUint64Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint64YLarger(t *testing.T) { x, y := generateUint64Slices() y = append(y, uint64(100)) genericXCopy := make([]uint64, len(x)) copy(genericXCopy, x) sumUint64(x, y) sumUint64Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func TestSumUint64XLarger(t *testing.T) { x, y := generateUint64Slices() x = append(x, uint64(100)) genericXCopy := make([]uint64, len(x)) copy(genericXCopy, x) sumUint64(x, y) sumUint64Generic(genericXCopy, y) for i := 0; i < len(x); i++ { if x[i] != genericXCopy[i] { t.Fatalf("mismatch sums at index %d, expected %d : got %d", i, genericXCopy[i], x[i]) } } } func generateUint8Slices() ([]uint8, []uint8) { var x []uint8 var y []uint8 for i := 0; i < size; i++ { x = append(x, uint8(i)) y = append(y, uint8(i)) } return x, y } func generateUint16Slices() ([]uint16, []uint16) { var x []uint16 var y []uint16 for i := 0; i < size; i++ { x = append(x, uint16(i)) y = append(y, uint16(i)) } return x, y } func generateUint32Slices() ([]uint32, []uint32) { var x []uint32 var y []uint32 prng := rand.New(rand.NewSource(0)) for i := 0; i < size; i++ { x = append(x, prng.Uint32()) y = append(y, prng.Uint32()) } return x, y } func generateUint64Slices() ([]uint64, []uint64) { var x []uint64 var y []uint64 prng := rand.New(rand.NewSource(0)) for i := 0; i < size; i++ { x = append(x, prng.Uint64()) y = append(y, prng.Uint64()) } return x, y } func BenchmarkSumUint8(b *testing.B) { x, y := generateUint8Slices() for i := 0; i < b.N; i++ { SumUint8(x, y) } } func BenchmarkSumUint8Generic(b *testing.B) { x, y := generateUint8Slices() for i := 0; i < b.N; i++ { sumUint8Generic(x, y) } } func BenchmarkSumUint16(b *testing.B) { x, y := generateUint16Slices() for i := 0; i < b.N; i++ { SumUint16(x, y) } } func BenchmarkSumUint16Generic(b *testing.B) { x, y := generateUint16Slices() for i := 0; i < b.N; i++ { sumUint16Generic(x, y) } } func BenchmarkSumUint32(b *testing.B) { x, y := generateUint32Slices() for i := 0; i < b.N; i++ { SumUint32(x, y) } } func BenchmarkSumUint32Generic(b *testing.B) { x, y := generateUint32Slices() for i := 0; i < b.N; i++ { sumUint32Generic(x, y) } } func BenchmarkSumUint64(b *testing.B) { x, y :=
generateUint64Slices() for i := 0; i < b.N; i++ { SumUint64(x, y) } } func BenchmarkSumUint64Generic(b *testing.B) { x, y := generateUint64Slices() for i := 0; i < b.N; i++ { sumUint64Generic(x, y) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/000077500000000000000000000000001452252572700236445ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/dedupe.go000066400000000000000000000025361452252572700254470ustar00rootroot00000000000000package sortedset import ( "bytes" "github.com/segmentio/asm/internal" ) // Dedupe writes to dst the deduplicated sequence of items of the given size // read from src, returning the byte slice containing the result. // // If dst is too small, a new slice is allocated and returned instead. // // The source and destination slices may be the same to perform in-place // deduplication of the elements. The behavior is undefined for any other // conditions where the source and destination slices overlap. // // The function panics if len(src) is not a multiple of the element size. func Dedupe(dst, src []byte, size int) []byte { if !internal.MultipleOf(size, len(src)) { panic("input length is not a multiple of the item size") } if len(dst) < len(src) { dst = make([]byte, len(src)) } var n int switch size { case 1: n = dedupe1(dst, src) case 2: n = dedupe2(dst, src) case 4: n = dedupe4(dst, src) case 8: n = dedupe8(dst, src) case 16: n = dedupe16(dst, src) case 32: n = dedupe32(dst, src) default: n = dedupeGeneric(dst, src, size) } return dst[:n] } func dedupeGeneric(dst, src []byte, size int) int { if len(src) == 0 { return 0 } i := size j := size copy(dst, src[:size]) for i < len(src) { if !bytes.Equal(src[i-size:i], src[i:i+size]) { copy(dst[j:], src[i:i+size]) j += size } i += size } return j } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/dedupe_amd64.go000066400000000000000000000006721452252572700264410ustar00rootroot00000000000000// Code generated by command: go run dedupe_asm.go -pkg sortedset -out ../sortedset/dedupe_amd64.s -stubs ../sortedset/dedupe_amd64.go. DO NOT EDIT. //go:build !purego package sortedset func dedupe1(dst []byte, src []byte) int func dedupe2(dst []byte, src []byte) int func dedupe4(dst []byte, src []byte) int func dedupe8(dst []byte, src []byte) int func dedupe16(dst []byte, src []byte) int func dedupe32(dst []byte, src []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/dedupe_amd64.s000066400000000000000000000520251452252572700262750ustar00rootroot00000000000000// Code generated by command: go run dedupe_asm.go -pkg sortedset -out ../sortedset/dedupe_amd64.s -stubs ../sortedset/dedupe_amd64.go. DO NOT EDIT.
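// The routines below implement the dedupe1 through dedupe32 stubs declared in dedupe_amd64.go; dedupe_default.go provides the generic fallbacks used under the purego build tag or on non-amd64 targets.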
//go:build !purego #include "textflag.h" // func dedupe1(dst []byte, src []byte) int // Requires: CMOV TEXT ·dedupe1(SB), NOSPLIT, $0-56 MOVQ src_len+32(FP), AX CMPQ AX, $0x00 JE short MOVQ dst_base+0(FP), CX MOVQ src_base+24(FP), DX MOVQ CX, BX SUBQ $0x01, AX MOVB (DX), SI MOVB SI, (BX) ADDQ $0x01, BX CMPQ AX, $0x00 JE done generic: MOVQ BX, SI ADDQ $0x01, SI MOVB (DX), DI MOVB 1(DX), R8 MOVB R8, (BX) CMPB DI, R8 CMOVQNE SI, BX ADDQ $0x01, DX SUBQ $0x01, AX CMPQ AX, $0x00 JG generic done: SUBQ CX, BX MOVQ BX, ret+48(FP) RET short: MOVQ AX, ret+48(FP) RET // func dedupe2(dst []byte, src []byte) int // Requires: CMOV TEXT ·dedupe2(SB), NOSPLIT, $0-56 MOVQ src_len+32(FP), AX CMPQ AX, $0x00 JE short MOVQ dst_base+0(FP), CX MOVQ src_base+24(FP), DX MOVQ CX, BX SUBQ $0x02, AX MOVW (DX), SI MOVW SI, (BX) ADDQ $0x02, BX CMPQ AX, $0x00 JE done generic: MOVQ BX, SI ADDQ $0x02, SI MOVW (DX), DI MOVW 2(DX), R8 MOVW R8, (BX) CMPW DI, R8 CMOVQNE SI, BX ADDQ $0x02, DX SUBQ $0x02, AX CMPQ AX, $0x00 JG generic done: SUBQ CX, BX MOVQ BX, ret+48(FP) RET short: MOVQ AX, ret+48(FP) RET // func dedupe4(dst []byte, src []byte) int // Requires: AVX, CMOV TEXT ·dedupe4(SB), NOSPLIT, $0-56 MOVQ src_len+32(FP), AX CMPQ AX, $0x00 JE short MOVQ dst_base+0(FP), CX MOVQ src_base+24(FP), DX MOVQ CX, BX SUBQ $0x04, AX CMPQ AX, $0x10 JL init BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 init: MOVL (DX), SI MOVL SI, (BX) ADDQ $0x04, BX tail: CMPQ AX, $0x00 JE done generic: MOVQ BX, SI ADDQ $0x04, SI MOVL (DX), DI MOVL 4(DX), R8 MOVL R8, (BX) CMPL DI, R8 CMOVQNE SI, BX ADDQ $0x04, DX SUBQ $0x04, AX CMPQ AX, $0x00 JG generic done: SUBQ CX, BX MOVQ BX, ret+48(FP) RET short: MOVQ AX, ret+48(FP) RET avx2: MOVL (DX), SI MOVL SI, (BX) LEAQ dedupe4_shuffle_mask<>+0(SB), R14 LEAQ dedupe4_offset_array<>+0(SB), R15 ADDQ $0x04, BX CMPQ AX, $0x00000080 JL avx2_tail64 avx2_loop128: VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 32(DX), X4 VMOVDQU 48(DX), X6 VMOVDQU 64(DX), X8 VMOVDQU 80(DX), X10 VMOVDQU 96(DX), X12 VMOVDQU 112(DX), X14 VMOVDQU 4(DX), X1 VMOVDQU 20(DX), X3 VMOVDQU 36(DX), X5 VMOVDQU 52(DX), X7 VMOVDQU 68(DX), X9 VMOVDQU 84(DX), X11 VMOVDQU 100(DX), X13 VMOVDQU 116(DX), X15 VPCMPEQD X1, X0, X0 VMOVMSKPS X0, SI SHLQ $0x02, SI VPSHUFB (R14)(SI*4), X1, X1 MOVL (R15)(SI*1), SI VPCMPEQD X3, X2, X2 VMOVMSKPS X2, DI SHLQ $0x02, DI VPSHUFB (R14)(DI*4), X3, X3 MOVL (R15)(DI*1), DI ADDQ SI, DI VPCMPEQD X5, X4, X4 VMOVMSKPS X4, R8 SHLQ $0x02, R8 VPSHUFB (R14)(R8*4), X5, X5 MOVL (R15)(R8*1), R8 ADDQ DI, R8 VPCMPEQD X7, X6, X6 VMOVMSKPS X6, R9 SHLQ $0x02, R9 VPSHUFB (R14)(R9*4), X7, X7 MOVL (R15)(R9*1), R9 ADDQ R8, R9 VPCMPEQD X9, X8, X8 VMOVMSKPS X8, R10 SHLQ $0x02, R10 VPSHUFB (R14)(R10*4), X9, X9 MOVL (R15)(R10*1), R10 ADDQ R9, R10 VPCMPEQD X11, X10, X10 VMOVMSKPS X10, R11 SHLQ $0x02, R11 VPSHUFB (R14)(R11*4), X11, X11 MOVL (R15)(R11*1), R11 ADDQ R10, R11 VPCMPEQD X13, X12, X12 VMOVMSKPS X12, R12 SHLQ $0x02, R12 VPSHUFB (R14)(R12*4), X13, X13 MOVL (R15)(R12*1), R12 ADDQ R11, R12 VPCMPEQD X15, X14, X14 VMOVMSKPS X14, R13 SHLQ $0x02, R13 VPSHUFB (R14)(R13*4), X15, X15 MOVL (R15)(R13*1), R13 ADDQ R12, R13 VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) VMOVDQU X5, (BX)(DI*1) VMOVDQU X7, (BX)(R8*1) VMOVDQU X9, (BX)(R9*1) VMOVDQU X11, (BX)(R10*1) VMOVDQU X13, (BX)(R11*1) VMOVDQU X15, (BX)(R12*1) ADDQ R13, BX ADDQ $0x00000080, DX SUBQ $0x00000080, AX CMPQ AX, $0x00000080 JGE avx2_loop128 avx2_tail64: CMPQ AX, $0x40 JL avx2_tail32 VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 32(DX), X4 VMOVDQU 48(DX), X6 VMOVDQU 4(DX), X1 VMOVDQU 20(DX), 
X3 VMOVDQU 36(DX), X5 VMOVDQU 52(DX), X7 VPCMPEQD X1, X0, X0 VMOVMSKPS X0, SI SHLQ $0x02, SI VPSHUFB (R14)(SI*4), X1, X1 MOVL (R15)(SI*1), SI VPCMPEQD X3, X2, X2 VMOVMSKPS X2, DI SHLQ $0x02, DI VPSHUFB (R14)(DI*4), X3, X3 MOVL (R15)(DI*1), DI ADDQ SI, DI VPCMPEQD X5, X4, X4 VMOVMSKPS X4, R8 SHLQ $0x02, R8 VPSHUFB (R14)(R8*4), X5, X5 MOVL (R15)(R8*1), R8 ADDQ DI, R8 VPCMPEQD X7, X6, X6 VMOVMSKPS X6, R9 SHLQ $0x02, R9 VPSHUFB (R14)(R9*4), X7, X7 MOVL (R15)(R9*1), R9 ADDQ R8, R9 VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) VMOVDQU X5, (BX)(DI*1) VMOVDQU X7, (BX)(R8*1) ADDQ R9, BX ADDQ $0x40, DX SUBQ $0x40, AX avx2_tail32: CMPQ AX, $0x20 JL avx2_tail16 VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 4(DX), X1 VMOVDQU 20(DX), X3 VPCMPEQD X1, X0, X0 VMOVMSKPS X0, SI SHLQ $0x02, SI VPSHUFB (R14)(SI*4), X1, X1 MOVL (R15)(SI*1), SI VPCMPEQD X3, X2, X2 VMOVMSKPS X2, DI SHLQ $0x02, DI VPSHUFB (R14)(DI*4), X3, X3 MOVL (R15)(DI*1), DI ADDQ SI, DI VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) ADDQ DI, BX ADDQ $0x20, DX SUBQ $0x20, AX avx2_tail16: CMPQ AX, $0x10 JL avx2_tail VMOVDQU (DX), X0 VMOVDQU 4(DX), X1 VPCMPEQD X1, X0, X0 VMOVMSKPS X0, SI SHLQ $0x02, SI VPSHUFB (R14)(SI*4), X1, X1 MOVL (R15)(SI*1), SI VMOVDQU X1, (BX) ADDQ SI, BX ADDQ $0x10, DX SUBQ $0x10, AX avx2_tail: VZEROUPPER JMP tail DATA dedupe4_shuffle_mask<>+0(SB)/8, $0x0706050403020100 DATA dedupe4_shuffle_mask<>+8(SB)/8, $0x0f0e0d0c0b0a0908 DATA dedupe4_shuffle_mask<>+16(SB)/8, $0x0b0a090807060504 DATA dedupe4_shuffle_mask<>+24(SB)/8, $0x030201000f0e0d0c DATA dedupe4_shuffle_mask<>+32(SB)/8, $0x0b0a090803020100 DATA dedupe4_shuffle_mask<>+40(SB)/8, $0x070605040f0e0d0c DATA dedupe4_shuffle_mask<>+48(SB)/8, $0x0f0e0d0c0b0a0908 DATA dedupe4_shuffle_mask<>+56(SB)/8, $0x0706050403020100 DATA dedupe4_shuffle_mask<>+64(SB)/8, $0x0706050403020100 DATA dedupe4_shuffle_mask<>+72(SB)/8, $0x0b0a09080f0e0d0c DATA dedupe4_shuffle_mask<>+80(SB)/8, $0x0f0e0d0c07060504 DATA dedupe4_shuffle_mask<>+88(SB)/8, $0x0b0a090803020100 DATA dedupe4_shuffle_mask<>+96(SB)/8, $0x0f0e0d0c03020100 DATA dedupe4_shuffle_mask<>+104(SB)/8, $0x0b0a090807060504 DATA dedupe4_shuffle_mask<>+112(SB)/8, $0x030201000f0e0d0c DATA dedupe4_shuffle_mask<>+120(SB)/8, $0x0b0a090807060504 DATA dedupe4_shuffle_mask<>+128(SB)/8, $0x0706050403020100 DATA dedupe4_shuffle_mask<>+136(SB)/8, $0x0f0e0d0c0b0a0908 DATA dedupe4_shuffle_mask<>+144(SB)/8, $0x0b0a090807060504 DATA dedupe4_shuffle_mask<>+152(SB)/8, $0x0f0e0d0c03020100 DATA dedupe4_shuffle_mask<>+160(SB)/8, $0x0f0e0d0c03020100 DATA dedupe4_shuffle_mask<>+168(SB)/8, $0x0b0a090807060504 DATA dedupe4_shuffle_mask<>+176(SB)/8, $0x030201000b0a0908 DATA dedupe4_shuffle_mask<>+184(SB)/8, $0x0f0e0d0c07060504 DATA dedupe4_shuffle_mask<>+192(SB)/8, $0x0706050403020100 DATA dedupe4_shuffle_mask<>+200(SB)/8, $0x0f0e0d0c0b0a0908 DATA dedupe4_shuffle_mask<>+208(SB)/8, $0x0302010007060504 DATA dedupe4_shuffle_mask<>+216(SB)/8, $0x0f0e0d0c0b0a0908 DATA dedupe4_shuffle_mask<>+224(SB)/8, $0x0706050403020100 DATA dedupe4_shuffle_mask<>+232(SB)/8, $0x0f0e0d0c0b0a0908 DATA dedupe4_shuffle_mask<>+240(SB)/8, $0x0706050403020100 DATA dedupe4_shuffle_mask<>+248(SB)/8, $0x0f0e0d0c0b0a0908 GLOBL dedupe4_shuffle_mask<>(SB), RODATA|NOPTR, $256 DATA dedupe4_offset_array<>+0(SB)/8, $0x0000000c00000010 DATA dedupe4_offset_array<>+8(SB)/8, $0x000000080000000c DATA dedupe4_offset_array<>+16(SB)/8, $0x000000080000000c DATA dedupe4_offset_array<>+24(SB)/8, $0x0000000400000008 DATA dedupe4_offset_array<>+32(SB)/8, $0x000000080000000c DATA dedupe4_offset_array<>+40(SB)/8, 
$0x0000000400000008 DATA dedupe4_offset_array<>+48(SB)/8, $0x0000000400000008 DATA dedupe4_offset_array<>+56(SB)/8, $0x0000000000000004 GLOBL dedupe4_offset_array<>(SB), RODATA|NOPTR, $64 // func dedupe8(dst []byte, src []byte) int // Requires: AVX, CMOV TEXT ·dedupe8(SB), NOSPLIT, $0-56 MOVQ src_len+32(FP), AX CMPQ AX, $0x00 JE short MOVQ dst_base+0(FP), CX MOVQ src_base+24(FP), DX MOVQ CX, BX SUBQ $0x08, AX CMPQ AX, $0x10 JL init BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 init: MOVQ (DX), SI MOVQ SI, (BX) ADDQ $0x08, BX tail: CMPQ AX, $0x00 JE done generic: MOVQ BX, SI ADDQ $0x08, SI MOVQ (DX), DI MOVQ 8(DX), R8 MOVQ R8, (BX) CMPQ DI, R8 CMOVQNE SI, BX ADDQ $0x08, DX SUBQ $0x08, AX CMPQ AX, $0x00 JG generic done: SUBQ CX, BX MOVQ BX, ret+48(FP) RET short: MOVQ AX, ret+48(FP) RET avx2: MOVQ (DX), SI MOVQ SI, (BX) LEAQ dedupe8_shuffle_mask<>+0(SB), R14 LEAQ dedupe8_offset_array<>+0(SB), R15 ADDQ $0x08, BX CMPQ AX, $0x00000080 JL avx2_tail64 avx2_loop128: VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 32(DX), X4 VMOVDQU 48(DX), X6 VMOVDQU 64(DX), X8 VMOVDQU 80(DX), X10 VMOVDQU 96(DX), X12 VMOVDQU 112(DX), X14 VMOVDQU 8(DX), X1 VMOVDQU 24(DX), X3 VMOVDQU 40(DX), X5 VMOVDQU 56(DX), X7 VMOVDQU 72(DX), X9 VMOVDQU 88(DX), X11 VMOVDQU 104(DX), X13 VMOVDQU 120(DX), X15 VPCMPEQQ X1, X0, X0 VMOVMSKPD X0, SI VPSHUFB (R14)(SI*8), X1, X1 MOVQ (R15)(SI*8), SI VPCMPEQQ X3, X2, X2 VMOVMSKPD X2, DI VPSHUFB (R14)(DI*8), X3, X3 MOVQ (R15)(DI*8), DI ADDQ SI, DI VPCMPEQQ X5, X4, X4 VMOVMSKPD X4, R8 VPSHUFB (R14)(R8*8), X5, X5 MOVQ (R15)(R8*8), R8 ADDQ DI, R8 VPCMPEQQ X7, X6, X6 VMOVMSKPD X6, R9 VPSHUFB (R14)(R9*8), X7, X7 MOVQ (R15)(R9*8), R9 ADDQ R8, R9 VPCMPEQQ X9, X8, X8 VMOVMSKPD X8, R10 VPSHUFB (R14)(R10*8), X9, X9 MOVQ (R15)(R10*8), R10 ADDQ R9, R10 VPCMPEQQ X11, X10, X10 VMOVMSKPD X10, R11 VPSHUFB (R14)(R11*8), X11, X11 MOVQ (R15)(R11*8), R11 ADDQ R10, R11 VPCMPEQQ X13, X12, X12 VMOVMSKPD X12, R12 VPSHUFB (R14)(R12*8), X13, X13 MOVQ (R15)(R12*8), R12 ADDQ R11, R12 VPCMPEQQ X15, X14, X14 VMOVMSKPD X14, R13 VPSHUFB (R14)(R13*8), X15, X15 MOVQ (R15)(R13*8), R13 ADDQ R12, R13 VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) VMOVDQU X5, (BX)(DI*1) VMOVDQU X7, (BX)(R8*1) VMOVDQU X9, (BX)(R9*1) VMOVDQU X11, (BX)(R10*1) VMOVDQU X13, (BX)(R11*1) VMOVDQU X15, (BX)(R12*1) ADDQ R13, BX ADDQ $0x00000080, DX SUBQ $0x00000080, AX CMPQ AX, $0x00000080 JGE avx2_loop128 avx2_tail64: CMPQ AX, $0x40 JL avx2_tail32 VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 32(DX), X4 VMOVDQU 48(DX), X6 VMOVDQU 8(DX), X1 VMOVDQU 24(DX), X3 VMOVDQU 40(DX), X5 VMOVDQU 56(DX), X7 VPCMPEQQ X1, X0, X0 VMOVMSKPD X0, SI VPSHUFB (R14)(SI*8), X1, X1 MOVQ (R15)(SI*8), SI VPCMPEQQ X3, X2, X2 VMOVMSKPD X2, DI VPSHUFB (R14)(DI*8), X3, X3 MOVQ (R15)(DI*8), DI ADDQ SI, DI VPCMPEQQ X5, X4, X4 VMOVMSKPD X4, R8 VPSHUFB (R14)(R8*8), X5, X5 MOVQ (R15)(R8*8), R8 ADDQ DI, R8 VPCMPEQQ X7, X6, X6 VMOVMSKPD X6, R9 VPSHUFB (R14)(R9*8), X7, X7 MOVQ (R15)(R9*8), R9 ADDQ R8, R9 VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) VMOVDQU X5, (BX)(DI*1) VMOVDQU X7, (BX)(R8*1) ADDQ R9, BX ADDQ $0x40, DX SUBQ $0x40, AX avx2_tail32: CMPQ AX, $0x20 JL avx2_tail16 VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 8(DX), X1 VMOVDQU 24(DX), X3 VPCMPEQQ X1, X0, X0 VMOVMSKPD X0, SI VPSHUFB (R14)(SI*8), X1, X1 MOVQ (R15)(SI*8), SI VPCMPEQQ X3, X2, X2 VMOVMSKPD X2, DI VPSHUFB (R14)(DI*8), X3, X3 MOVQ (R15)(DI*8), DI ADDQ SI, DI VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) ADDQ DI, BX ADDQ $0x20, DX SUBQ $0x20, AX avx2_tail16: CMPQ AX, $0x10 JL avx2_tail VMOVDQU (DX), X0 VMOVDQU 8(DX), X1 VPCMPEQQ 
X1, X0, X0 VMOVMSKPD X0, SI VPSHUFB (R14)(SI*8), X1, X1 MOVQ (R15)(SI*8), SI VMOVDQU X1, (BX) ADDQ SI, BX ADDQ $0x10, DX SUBQ $0x10, AX avx2_tail: VZEROUPPER JMP tail DATA dedupe8_shuffle_mask<>+0(SB)/8, $0x0706050403020100 DATA dedupe8_shuffle_mask<>+8(SB)/8, $0x0f0e0d0c0b0a0908 DATA dedupe8_shuffle_mask<>+16(SB)/8, $0x0706050403020100 DATA dedupe8_shuffle_mask<>+24(SB)/8, $0x0706050403020100 DATA dedupe8_shuffle_mask<>+32(SB)/8, $0x0706050403020100 GLOBL dedupe8_shuffle_mask<>(SB), RODATA|NOPTR, $40 DATA dedupe8_offset_array<>+0(SB)/8, $0x0000000000000010 DATA dedupe8_offset_array<>+8(SB)/8, $0x0000000000000008 DATA dedupe8_offset_array<>+16(SB)/8, $0x0000000000000008 DATA dedupe8_offset_array<>+24(SB)/8, $0x0000000000000000 GLOBL dedupe8_offset_array<>(SB), RODATA|NOPTR, $32 // func dedupe16(dst []byte, src []byte) int // Requires: AVX, CMOV, SSE2, SSE4.1 TEXT ·dedupe16(SB), NOSPLIT, $0-56 MOVQ src_len+32(FP), AX CMPQ AX, $0x00 JE short MOVQ dst_base+0(FP), CX MOVQ src_base+24(FP), DX MOVQ CX, BX SUBQ $0x10, AX CMPQ AX, $0x10 JL init BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 init: MOVOU (DX), X0 MOVOU X0, (BX) ADDQ $0x10, BX tail: CMPQ AX, $0x00 JE done generic: MOVQ BX, SI ADDQ $0x10, SI MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU X1, (BX) PCMPEQQ X0, X1 PMOVMSKB X1, DI CMPL DI, $0x0000ffff CMOVQNE SI, BX ADDQ $0x10, DX SUBQ $0x10, AX CMPQ AX, $0x00 JG generic done: SUBQ CX, BX MOVQ BX, ret+48(FP) RET short: MOVQ AX, ret+48(FP) RET avx2: VMOVDQU (DX), X0 VMOVDQU X0, (BX) XORQ R14, R14 MOVQ $0x0000000000000010, R15 ADDQ $0x10, BX CMPQ AX, $0x00000080 JL avx2_tail64 avx2_loop128: VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 32(DX), X4 VMOVDQU 48(DX), X5 VMOVDQU 64(DX), X7 VMOVDQU 80(DX), X8 VMOVDQU 96(DX), X9 VMOVDQU 112(DX), X10 VMOVDQU 128(DX), X11 VMOVDQA X2, X1 VPCMPEQQ X2, X0, X0 VMOVMSKPD X0, SI CMPQ SI, $0x03 CMOVQEQ R14, SI CMOVQNE R15, SI VMOVDQA X4, X3 VPCMPEQQ X4, X2, X2 VMOVMSKPD X2, DI CMPQ DI, $0x03 CMOVQEQ R14, DI CMOVQNE R15, DI ADDQ SI, DI VMOVDQA X5, X0 VPCMPEQQ X5, X4, X4 VMOVMSKPD X4, R8 CMPQ R8, $0x03 CMOVQEQ R14, R8 CMOVQNE R15, R8 ADDQ DI, R8 VMOVDQA X7, X6 VPCMPEQQ X7, X5, X5 VMOVMSKPD X5, R9 CMPQ R9, $0x03 CMOVQEQ R14, R9 CMOVQNE R15, R9 ADDQ R8, R9 VMOVDQA X8, X2 VPCMPEQQ X8, X7, X7 VMOVMSKPD X7, R10 CMPQ R10, $0x03 CMOVQEQ R14, R10 CMOVQNE R15, R10 ADDQ R9, R10 VMOVDQA X9, X4 VPCMPEQQ X9, X8, X8 VMOVMSKPD X8, R11 CMPQ R11, $0x03 CMOVQEQ R14, R11 CMOVQNE R15, R11 ADDQ R10, R11 VMOVDQA X10, X5 VPCMPEQQ X10, X9, X9 VMOVMSKPD X9, R12 CMPQ R12, $0x03 CMOVQEQ R14, R12 CMOVQNE R15, R12 ADDQ R11, R12 VPCMPEQQ X11, X10, X10 VMOVMSKPD X10, R13 CMPQ R13, $0x03 CMOVQEQ R14, R13 CMOVQNE R15, R13 ADDQ R12, R13 VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) VMOVDQU X0, (BX)(DI*1) VMOVDQU X6, (BX)(R8*1) VMOVDQU X2, (BX)(R9*1) VMOVDQU X4, (BX)(R10*1) VMOVDQU X5, (BX)(R11*1) VMOVDQU X11, (BX)(R12*1) ADDQ R13, BX ADDQ $0x00000080, DX SUBQ $0x00000080, AX CMPQ AX, $0x00000080 JGE avx2_loop128 avx2_tail64: CMPQ AX, $0x40 JL avx2_tail32 VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 32(DX), X4 VMOVDQU 48(DX), X5 VMOVDQU 64(DX), X6 VMOVDQA X2, X1 VPCMPEQQ X2, X0, X0 VMOVMSKPD X0, SI CMPQ SI, $0x03 CMOVQEQ R14, SI CMOVQNE R15, SI VMOVDQA X4, X3 VPCMPEQQ X4, X2, X2 VMOVMSKPD X2, DI CMPQ DI, $0x03 CMOVQEQ R14, DI CMOVQNE R15, DI ADDQ SI, DI VMOVDQA X5, X0 VPCMPEQQ X5, X4, X4 VMOVMSKPD X4, R8 CMPQ R8, $0x03 CMOVQEQ R14, R8 CMOVQNE R15, R8 ADDQ DI, R8 VPCMPEQQ X6, X5, X5 VMOVMSKPD X5, R9 CMPQ R9, $0x03 CMOVQEQ R14, R9 CMOVQNE R15, R9 ADDQ R8, R9 VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) 
VMOVDQU X0, (BX)(DI*1) VMOVDQU X6, (BX)(R8*1) ADDQ R9, BX ADDQ $0x40, DX SUBQ $0x40, AX avx2_tail32: CMPQ AX, $0x20 JL avx2_tail16 VMOVDQU (DX), X0 VMOVDQU 16(DX), X2 VMOVDQU 32(DX), X3 VMOVDQA X2, X1 VPCMPEQQ X2, X0, X0 VMOVMSKPD X0, SI CMPQ SI, $0x03 CMOVQEQ R14, SI CMOVQNE R15, SI VPCMPEQQ X3, X2, X2 VMOVMSKPD X2, DI CMPQ DI, $0x03 CMOVQEQ R14, DI CMOVQNE R15, DI ADDQ SI, DI VMOVDQU X1, (BX) VMOVDQU X3, (BX)(SI*1) ADDQ DI, BX ADDQ $0x20, DX SUBQ $0x20, AX avx2_tail16: CMPQ AX, $0x10 JL avx2_tail VMOVDQU (DX), X0 VMOVDQU 16(DX), X1 VPCMPEQQ X1, X0, X0 VMOVMSKPD X0, SI CMPQ SI, $0x03 CMOVQEQ R14, SI CMOVQNE R15, SI VMOVDQU X1, (BX) ADDQ SI, BX ADDQ $0x10, DX SUBQ $0x10, AX avx2_tail: VZEROUPPER JMP tail // func dedupe32(dst []byte, src []byte) int // Requires: AVX, AVX2, CMOV, SSE2, SSE4.1 TEXT ·dedupe32(SB), NOSPLIT, $0-56 MOVQ src_len+32(FP), AX CMPQ AX, $0x00 JE short MOVQ dst_base+0(FP), CX MOVQ src_base+24(FP), DX MOVQ CX, BX SUBQ $0x20, AX CMPQ AX, $0x20 JL init BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB) JCS avx2 init: MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU X0, (BX) MOVOU X1, 16(BX) ADDQ $0x20, BX tail: CMPQ AX, $0x00 JE done generic: MOVQ BX, SI ADDQ $0x20, SI MOVOU (DX), X0 MOVOU 16(DX), X1 MOVOU 32(DX), X2 MOVOU 48(DX), X3 MOVOU X2, (BX) MOVOU X3, 16(BX) PCMPEQQ X0, X2 PCMPEQQ X1, X3 PMOVMSKB X2, DI PMOVMSKB X3, R8 ANDL R8, DI CMPL DI, $0x0000ffff CMOVQNE SI, BX ADDQ $0x20, DX SUBQ $0x20, AX CMPQ AX, $0x00 JG generic done: SUBQ CX, BX MOVQ BX, ret+48(FP) RET short: MOVQ AX, ret+48(FP) RET avx2: VMOVDQU (DX), Y0 VMOVDQU Y0, (BX) XORQ R14, R14 MOVQ $0x0000000000000020, R15 ADDQ $0x20, BX CMPQ AX, $0x00000100 JL avx2_tail128 avx2_loop256: VMOVDQU (DX), Y0 VMOVDQU 32(DX), Y2 VMOVDQU 64(DX), Y4 VMOVDQU 96(DX), Y5 VMOVDQU 128(DX), Y7 VMOVDQU 160(DX), Y8 VMOVDQU 192(DX), Y9 VMOVDQU 224(DX), Y10 VMOVDQU 256(DX), Y11 VMOVDQA Y2, Y1 VPCMPEQQ Y2, Y0, Y0 VMOVMSKPD Y0, SI CMPQ SI, $0x0f CMOVQEQ R14, SI CMOVQNE R15, SI VMOVDQA Y4, Y3 VPCMPEQQ Y4, Y2, Y2 VMOVMSKPD Y2, DI CMPQ DI, $0x0f CMOVQEQ R14, DI CMOVQNE R15, DI ADDQ SI, DI VMOVDQA Y5, Y0 VPCMPEQQ Y5, Y4, Y4 VMOVMSKPD Y4, R8 CMPQ R8, $0x0f CMOVQEQ R14, R8 CMOVQNE R15, R8 ADDQ DI, R8 VMOVDQA Y7, Y6 VPCMPEQQ Y7, Y5, Y5 VMOVMSKPD Y5, R9 CMPQ R9, $0x0f CMOVQEQ R14, R9 CMOVQNE R15, R9 ADDQ R8, R9 VMOVDQA Y8, Y2 VPCMPEQQ Y8, Y7, Y7 VMOVMSKPD Y7, R10 CMPQ R10, $0x0f CMOVQEQ R14, R10 CMOVQNE R15, R10 ADDQ R9, R10 VMOVDQA Y9, Y4 VPCMPEQQ Y9, Y8, Y8 VMOVMSKPD Y8, R11 CMPQ R11, $0x0f CMOVQEQ R14, R11 CMOVQNE R15, R11 ADDQ R10, R11 VMOVDQA Y10, Y5 VPCMPEQQ Y10, Y9, Y9 VMOVMSKPD Y9, R12 CMPQ R12, $0x0f CMOVQEQ R14, R12 CMOVQNE R15, R12 ADDQ R11, R12 VPCMPEQQ Y11, Y10, Y10 VMOVMSKPD Y10, R13 CMPQ R13, $0x0f CMOVQEQ R14, R13 CMOVQNE R15, R13 ADDQ R12, R13 VMOVDQU Y1, (BX) VMOVDQU Y3, (BX)(SI*1) VMOVDQU Y0, (BX)(DI*1) VMOVDQU Y6, (BX)(R8*1) VMOVDQU Y2, (BX)(R9*1) VMOVDQU Y4, (BX)(R10*1) VMOVDQU Y5, (BX)(R11*1) VMOVDQU Y11, (BX)(R12*1) ADDQ R13, BX ADDQ $0x00000100, DX SUBQ $0x00000100, AX CMPQ AX, $0x00000100 JGE avx2_loop256 avx2_tail128: CMPQ AX, $0x80 JL avx2_tail64 VMOVDQU (DX), Y0 VMOVDQU 32(DX), Y2 VMOVDQU 64(DX), Y4 VMOVDQU 96(DX), Y5 VMOVDQU 128(DX), Y6 VMOVDQA Y2, Y1 VPCMPEQQ Y2, Y0, Y0 VMOVMSKPD Y0, SI CMPQ SI, $0x0f CMOVQEQ R14, SI CMOVQNE R15, SI VMOVDQA Y4, Y3 VPCMPEQQ Y4, Y2, Y2 VMOVMSKPD Y2, DI CMPQ DI, $0x0f CMOVQEQ R14, DI CMOVQNE R15, DI ADDQ SI, DI VMOVDQA Y5, Y0 VPCMPEQQ Y5, Y4, Y4 VMOVMSKPD Y4, R8 CMPQ R8, $0x0f CMOVQEQ R14, R8 CMOVQNE R15, R8 ADDQ DI, R8 VPCMPEQQ Y6, Y5, Y5 VMOVMSKPD Y5, R9 CMPQ R9, $0x0f CMOVQEQ R14, R9 
CMOVQNE R15, R9 ADDQ R8, R9 VMOVDQU Y1, (BX) VMOVDQU Y3, (BX)(SI*1) VMOVDQU Y0, (BX)(DI*1) VMOVDQU Y6, (BX)(R8*1) ADDQ R9, BX ADDQ $0x80, DX SUBQ $0x80, AX avx2_tail64: CMPQ AX, $0x40 JL avx2_tail32 VMOVDQU (DX), Y0 VMOVDQU 32(DX), Y2 VMOVDQU 64(DX), Y3 VMOVDQA Y2, Y1 VPCMPEQQ Y2, Y0, Y0 VMOVMSKPD Y0, SI CMPQ SI, $0x0f CMOVQEQ R14, SI CMOVQNE R15, SI VPCMPEQQ Y3, Y2, Y2 VMOVMSKPD Y2, DI CMPQ DI, $0x0f CMOVQEQ R14, DI CMOVQNE R15, DI ADDQ SI, DI VMOVDQU Y1, (BX) VMOVDQU Y3, (BX)(SI*1) ADDQ DI, BX ADDQ $0x40, DX SUBQ $0x40, AX avx2_tail32: CMPQ AX, $0x20 JL avx2_tail VMOVDQU (DX), Y0 VMOVDQU 32(DX), Y1 VPCMPEQQ Y1, Y0, Y0 VMOVMSKPD Y0, SI CMPQ SI, $0x0f CMOVQEQ R14, SI CMOVQNE R15, SI VMOVDQU Y1, (BX) ADDQ SI, BX ADDQ $0x20, DX SUBQ $0x20, AX avx2_tail: VZEROUPPER JMP tail golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/dedupe_default.go000066400000000000000000000010071452252572700271430ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package sortedset func dedupe1(dst, src []byte) int { return dedupeGeneric(dst, src, 1) } func dedupe2(dst, src []byte) int { return dedupeGeneric(dst, src, 2) } func dedupe4(dst, src []byte) int { return dedupeGeneric(dst, src, 4) } func dedupe8(dst, src []byte) int { return dedupeGeneric(dst, src, 8) } func dedupe16(dst, src []byte) int { return dedupeGeneric(dst, src, 16) } func dedupe32(dst, src []byte) int { return dedupeGeneric(dst, src, 32) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/dedupe_test.go000066400000000000000000000041511452252572700265010ustar00rootroot00000000000000package sortedset import ( "fmt" "math/rand" "testing" ) var dedupeSpecializationSizes = []int{4, 8, 16, 32} var repeatChances = []float64{0, 0.1, 0.5, 1.0} func TestDedupe(t *testing.T) { for _, size := range []int{1, 2, 3, 4, 8, 10, 16, 32} { makeArray := func(items ...byte) []byte { array := make([]byte, len(items)*size) for i := range items { array[i*size] = items[i] } return array } for _, test := range []struct { name string b []byte expect []byte }{ { name: "empty", }, { name: "all dupes", b: makeArray(1, 1, 1, 1, 1, 1, 1, 1), expect: makeArray(1), }, { name: "no dupes", b: makeArray(1, 2, 3, 4, 5, 6, 7, 8), expect: makeArray(1, 2, 3, 4, 5, 6, 7, 8), }, { name: "some dupes", b: makeArray(0, 0, 0, 1, 1, 2, 3, 3, 4, 4, 4), expect: makeArray(0, 1, 2, 3, 4), }, } { t.Run(fmt.Sprintf("size %d, %s", size, test.name), func(t *testing.T) { actual := Dedupe(nil, test.b, size) assertArraysEqual(t, test.expect, actual, size) }) } } // Test the specializations. 
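// Each specialization is exercised with randomly generated sorted arrays: // for every size and repeat probability, the output of Dedupe must equal the // unique items that randomSortedArray reports alongside the input.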
for _, size := range dedupeSpecializationSizes { t.Run(fmt.Sprintf("size %d, random", size), func(t *testing.T) { const maxCount = 100 const iterations = 1000 prng := rand.New(rand.NewSource(0)) for i := 0; i < iterations; i++ { count := prng.Intn(maxCount) for _, p := range repeatChances { array, uniques := randomSortedArray(prng, size, count, p) result := Dedupe(nil, array, size) assertArraysEqual(t, uniques, result, size) } } }) } } func BenchmarkDedupe(b *testing.B) { for _, size := range dedupeSpecializationSizes { for _, p := range repeatChances { b.Run(fmt.Sprintf("size %d, with %d%% chance of repeat", size, int(p*100)), func(b *testing.B) { const bytes = 64 * 1024 prng := rand.New(rand.NewSource(0)) src, _ := randomSortedArray(prng, size, bytes/size, p) buf := make([]byte, len(src)) b.SetBytes(bytes) b.ResetTimer() for i := 0; i < b.N; i++ { //copy(buf, src) _ = Dedupe(buf, src, size) } }) } } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/intersect.go000066400000000000000000000021621452252572700261740ustar00rootroot00000000000000package sortedset import ( "bytes" "github.com/segmentio/asm/cpu" "github.com/segmentio/asm/cpu/x86" "github.com/segmentio/asm/internal" ) func Intersect(dst, a, b []byte, size int) []byte { if len(a) == 0 || len(b) == 0 { return dst[:0] } if size <= 0 || !internal.PairMultipleOf(size, len(a), len(b)) { panic("input lengths must be a multiple of size") } if cap(dst) < len(a) && cap(dst) < len(b) { panic("cap(dst) < min(len(a),len(b))") } // Fast paths for non-overlapping sets. if bytes.Compare(a[len(a)-size:], b[:size]) < 0 || bytes.Compare(b[len(b)-size:], a[:size]) < 0 { return dst[:0] } var pos int switch { case size == 16 && cpu.X86.Has(x86.AVX): pos = intersect16(dst, a, b) default: pos = intersectGeneric(dst, a, b, size) } return dst[:pos] } func intersectGeneric(dst, a, b []byte, size int) int { i, j, k := 0, 0, 0 for i < len(a) && j < len(b) { itemA := a[i : i+size] itemB := b[j : j+size] switch bytes.Compare(itemA, itemB) { case 0: copy(dst[k:k+size], itemA) i += size j += size k += size case -1: i += size case 1: j += size } } return k } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/intersect16_amd64.go000066400000000000000000000004011452252572700273300ustar00rootroot00000000000000// Code generated by command: go run intersect16_asm.go -pkg sortedset -out ../sortedset/intersect16_amd64.s -stubs ../sortedset/intersect16_amd64.go. DO NOT EDIT. //go:build !purego package sortedset func intersect16(dst []byte, a []byte, b []byte) int golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/intersect16_amd64.s000066400000000000000000000023231452252572700271720ustar00rootroot00000000000000// Code generated by command: go run intersect16_asm.go -pkg sortedset -out ../sortedset/intersect16_amd64.s -stubs ../sortedset/intersect16_amd64.go. DO NOT EDIT. 
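// In short: the kernel below compares one 16-byte item from each input per // iteration; VPCMPEQB finds equal bytes and VPMINUB determines which item is // lexicographically smaller at the first differing byte. Equal items are // appended to dst and both cursors advance, otherwise only the smaller side // advances.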
//go:build !purego #include "textflag.h" // func intersect16(dst []byte, a []byte, b []byte) int // Requires: AVX TEXT ·intersect16(SB), NOSPLIT, $0-80 MOVQ dst_base+0(FP), AX MOVQ a_base+24(FP), CX MOVQ b_base+48(FP), DX MOVQ a_len+32(FP), BX ADDQ CX, BX MOVQ b_len+56(FP), SI ADDQ DX, SI VPCMPEQB X0, X0, X0 VMOVUPS (CX), X1 VMOVUPS (DX), X2 loop: VPCMPEQB X1, X2, X3 VPXOR X3, X0, X3 VPMINUB X1, X2, X4 VPCMPEQB X1, X4, X4 VPAND X4, X3, X4 VPMOVMSKB X3, DI VPMOVMSKB X4, R8 TESTL DI, DI JZ equal BSFL DI, R9 BTSL R9, R8 JCS less ADDQ $0x10, DX CMPQ DX, SI JE done VMOVUPS (DX), X2 JMP loop less: ADDQ $0x10, CX CMPQ CX, BX JE done VMOVUPS (CX), X1 JMP loop equal: VMOVUPS X1, (AX) ADDQ $0x10, AX ADDQ $0x10, CX ADDQ $0x10, DX CMPQ CX, BX JE done CMPQ DX, SI JE done VMOVUPS (CX), X1 VMOVUPS (DX), X2 JMP loop done: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+72(FP) RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/intersect_test.go000066400000000000000000000074411452252572700272400ustar00rootroot00000000000000package sortedset import ( "bytes" "fmt" "math/rand" "testing" ) var intersectSpecializationSizes = []int{16} func TestIntersect(t *testing.T) { for _, test := range []struct { name string a []byte b []byte size int expect []byte }{ { name: "empty", size: 1, }, { name: "size 1, empty a", a: nil, b: []byte{1, 2, 3, 4, 5}, size: 1, expect: nil, }, { name: "size 1, empty b", a: []byte{1, 2, 3, 4, 5}, b: nil, size: 1, expect: nil, }, { name: "size 1, a == b", a: []byte{1, 2, 3, 4, 5}, b: []byte{1, 2, 3, 4, 5}, size: 1, expect: []byte{1, 2, 3, 4, 5}, }, { name: "size 1, a < b", a: []byte{1, 2, 3}, b: []byte{4, 5, 6}, size: 1, expect: nil, }, { name: "size 1, b < a", a: []byte{4, 5, 6}, b: []byte{1, 2, 3}, size: 1, expect: nil, }, { name: "size 1, a <= b", a: []byte{1, 2, 3}, b: []byte{3, 4, 5}, size: 1, expect: []byte{3}, }, { name: "size 1, b <= a", a: []byte{3, 4, 5}, b: []byte{1, 2, 3}, size: 1, expect: []byte{3}, }, { name: "size 1, interleaved 1", a: []byte{1, 3, 5}, b: []byte{2, 4, 6}, size: 1, expect: nil, }, { name: "size 1, interleaved 2", a: []byte{2, 4, 6}, b: []byte{1, 3, 5}, size: 1, expect: nil, }, { name: "size 1, overlapping 1", a: []byte{1, 2, 3, 4, 5, 6}, b: []byte{2, 4, 6, 8}, size: 1, expect: []byte{2, 4, 6}, }, { name: "size 1, overlapping 2", a: []byte{2, 3, 4, 5}, b: []byte{1, 3, 5, 7}, size: 1, expect: []byte{3, 5}, }, } { t.Run(test.name, func(t *testing.T) { buf := make([]byte, len(test.a)+len(test.b)) actual := Intersect(buf, test.a, test.b, test.size) assertArraysEqual(t, test.expect, actual, test.size) }) } // Test the specializations. for _, size := range intersectSpecializationSizes { t.Run(fmt.Sprintf("size %d, random", size), func(t *testing.T) { const maxCount = 100 const iterations = 1000 prng := rand.New(rand.NewSource(0)) buf := make([]byte, size*maxCount*2) for i := 0; i < iterations; i++ { count := prng.Intn(maxCount) for _, p := range overlapChances { setA, setB := randomSortedSetPair(prng, size, count, p) actual := Intersect(buf[:0], setA, setB, size) // Manual intersection on a sorted array: combined := combineArrays(setA, setB, size) expected := buf[:0] if len(combined) > 0 { prev := combined[:size] for i := size; i < len(combined); i += size { item := combined[i : i+size] if bytes.Equal(item, prev) { expected = append(expected, item...) 
} prev = item } } assertArraysEqual(t, expected, actual, size) } } }) } } func BenchmarkIntersect(b *testing.B) { for _, size := range intersectSpecializationSizes { for _, p := range overlapChances { b.Run(fmt.Sprintf("size %d, with %d%% chance of overlap", size, int(p*100)), func(b *testing.B) { const bytes = 64 * 1024 prng := rand.New(rand.NewSource(0)) setA, setB := randomSortedSetPair(prng, size, bytes/size, p) buf := make([]byte, bytes*2) b.SetBytes(int64(bytes * 2)) b.ResetTimer() for i := 0; i < b.N; i++ { Intersect(buf[:0], setA, setB, size) } }) } } b.Run("no overlap", func(b *testing.B) { prng := rand.New(rand.NewSource(0)) array, _ := randomSortedArray(prng, 16, 128, 0.0) dst := make([]byte, 16*64) b.ResetTimer() for i := 0; i < b.N; i++ { Intersect(dst, array[:64*16], array[64*16:], 16) } }) b.Run("empty", func(b *testing.B) { for i := 0; i < b.N; i++ { Intersect(nil, nil, nil, 16) } }) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/sortedset_default.go000066400000000000000000000003611452252572700277130ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package sortedset func intersect16(dst, a, b []byte) int { return intersectGeneric(dst, a, b, 16) } func union16(dst, a, b []byte) (i, j, k int) { return unionGeneric(dst, a, b, 16) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/sortedset_test.go000066400000000000000000000054531452252572700272550ustar00rootroot00000000000000package sortedset import ( "bytes" "encoding/hex" "math/rand" "sort" "testing" ) func assertArraysEqual(t *testing.T, expected, actual []byte, size int) { t.Helper() if !bytes.Equal(expected, actual) { t.Logf("\nexpected (%d):\n%s\nfound (%d):\n%s", len(expected), hex.Dump(expected), len(actual), hex.Dump(actual)) t.Fatal("arrays are not equal") } } func randomSortedArray(prng *rand.Rand, size int, count int, repeatChance float64) (array []byte, uniques []byte) { if count == 0 { return nil, nil } // Generate `count` random chunks of `size` bytes and then sort them. pool := make([]byte, size*count) prng.Read(pool) sortArray(pool, size) // Sanity checks — the items must be unique and sorted. for i := size; i < len(pool); i += size { switch bytes.Compare(pool[i-size:i], pool[i:i+size]) { case 0: panic("duplicate item in pool") case 1: panic("not sorted correctly") } } array = make([]byte, 0, size*count) // Build an array from the pool of unique items, using the configurable // chance of repeat. A repeatChance of 0 will yield an array where every // item is unique, while a repeatChance of 1 will yield an array where // every item is a duplicate of the first item. uniq := size for i := 0; i < count; i++ { array = append(array, pool[uniq-size:uniq]...) if prng.Float64() >= repeatChance && i != count-1 { uniq += size } } // Return a second array with just the unique items. uniques = pool[:uniq] return } func randomSortedSet(prng *rand.Rand, size int, count int) []byte { _, set := randomSortedArray(prng, size, count, 0.0) return set } func randomSortedSetPair(prng *rand.Rand, size int, count int, overlapChance float64) ([]byte, []byte) { setA := randomSortedSet(prng, size, count) setB := randomSortedSet(prng, size, count) // Sanity check: there must be no duplicates. if len(combineArrays(setA, setB, size)) != count*size*2 { panic("sorted sets overlap") } // Build a new set by taking items from both setA and setB depending // on the value of overlapChance. 
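// A fraction overlapChance of the items in the second set is taken from // setA (the shared prefix of split bytes), and the rest comes from setB, // which is disjoint from setA by construction.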
split := int(float64(count)*overlapChance) * size overlap := combineArrays(setA[:split], setB[:len(setB)-split], size) return setA, overlap } func combineArrays(a, b []byte, size int) []byte { return sortArray(append(append([]byte{}, a...), b...), size) } func sortArray(b []byte, size int) []byte { sort.Sort(&chunks{b: b, size: size}) return b } type chunks struct { b []byte size int tmp []byte } func (s *chunks) Len() int { return len(s.b) / s.size } func (s *chunks) Less(i, j int) bool { return bytes.Compare(s.slice(i), s.slice(j)) < 0 } func (s *chunks) Swap(i, j int) { tmp := make([]byte, s.size) copy(tmp, s.slice(j)) copy(s.slice(j), s.slice(i)) copy(s.slice(i), tmp) } func (s *chunks) slice(i int) []byte { return s.b[i*s.size : (i+1)*s.size] } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/union.go000066400000000000000000000027601452252572700253300ustar00rootroot00000000000000package sortedset import ( "bytes" "github.com/segmentio/asm/cpu" "github.com/segmentio/asm/cpu/x86" "github.com/segmentio/asm/internal" ) func Union(dst, a, b []byte, size int) []byte { if size <= 0 || !internal.PairMultipleOf(size, len(a), len(b)) { panic("input lengths must be a multiple of size") } if cap(dst) < len(a)+len(b) { panic("cap(dst) < len(a)+len(b)") } // Fast paths for non-overlapping sets. switch { case len(a) == 0: return dst[:copy(dst[:cap(dst)], b)] case len(b) == 0: return dst[:copy(dst[:cap(dst)], a)] case bytes.Compare(a[len(a)-size:], b[:size]) < 0: k := copy(dst[:len(a)], a) k += copy(dst[k:k+len(b)], b) return dst[:k] case bytes.Compare(b[len(b)-size:], a[:size]) < 0: k := copy(dst[:len(b)], b) k += copy(dst[k:k+len(a)], a) return dst[:k] } i, j, k := 0, 0, 0 switch { case size == 16 && cpu.X86.Has(x86.AVX): i, j, k = union16(dst, a, b) default: i, j, k = unionGeneric(dst, a, b, size) } if i < len(a) { k += copy(dst[k:k+len(a)-i], a[i:]) } else if j < len(b) { k += copy(dst[k:k+len(b)-j], b[j:]) } return dst[:k] } func unionGeneric(dst, a, b []byte, size int) (i, j, k int) { i, j, k = 0, 0, 0 for i < len(a) && j < len(b) { itemA := a[i : i+size] itemB := b[j : j+size] switch bytes.Compare(itemA, itemB) { case 0: copy(dst[k:k+size], itemA) i += size j += size case -1: copy(dst[k:k+size], itemA) i += size case 1: copy(dst[k:k+size], itemB) j += size } k += size } return } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/union16_amd64.go000066400000000000000000000004031452252572700264620ustar00rootroot00000000000000// Code generated by command: go run union16_asm.go -pkg sortedset -out ../sortedset/union16_amd64.s -stubs ../sortedset/union16_amd64.go. DO NOT EDIT. //go:build !purego package sortedset func union16(dst []byte, a []byte, b []byte) (i int, j int, k int) golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/union16_amd64.s000066400000000000000000000026151452252572700263260ustar00rootroot00000000000000// Code generated by command: go run union16_asm.go -pkg sortedset -out ../sortedset/union16_amd64.s -stubs ../sortedset/union16_amd64.go. DO NOT EDIT. 
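// In short: the kernel below merges one 16-byte item from each input per // iteration, writing the smaller item (or a single copy of two equal items) // to dst. It stops as soon as either input is exhausted and returns the // consumed offsets i and j along with the number of bytes k written, so the // caller can copy the remaining tail.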
//go:build !purego #include "textflag.h" // func union16(dst []byte, a []byte, b []byte) (i int, j int, k int) // Requires: AVX TEXT ·union16(SB), NOSPLIT, $0-96 MOVQ dst_base+0(FP), AX MOVQ a_base+24(FP), CX MOVQ b_base+48(FP), DX MOVQ a_len+32(FP), BX ADDQ CX, BX MOVQ b_len+56(FP), SI ADDQ DX, SI VPCMPEQB X0, X0, X0 VMOVUPS (CX), X1 VMOVUPS (DX), X2 loop: VPCMPEQB X1, X2, X3 VPXOR X3, X0, X3 VPMINUB X1, X2, X4 VPCMPEQB X1, X4, X4 VPAND X4, X3, X4 VPMOVMSKB X3, DI VPMOVMSKB X4, R8 TESTL DI, DI JZ equal BSFL DI, R9 BTSL R9, R8 JCS less VMOVUPS X2, (AX) ADDQ $0x10, AX ADDQ $0x10, DX CMPQ DX, SI JE done VMOVUPS (DX), X2 JMP loop less: VMOVUPS X1, (AX) ADDQ $0x10, AX ADDQ $0x10, CX CMPQ CX, BX JE done VMOVUPS (CX), X1 JMP loop equal: VMOVUPS X1, (AX) ADDQ $0x10, AX ADDQ $0x10, CX ADDQ $0x10, DX CMPQ CX, BX JE done CMPQ DX, SI JE done VMOVUPS (CX), X1 VMOVUPS (DX), X2 JMP loop done: MOVQ a_base+24(FP), BX SUBQ BX, CX MOVQ CX, i+72(FP) MOVQ b_base+48(FP), CX SUBQ CX, DX MOVQ DX, j+80(FP) MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, k+88(FP) RET golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/sortedset/union_test.go000066400000000000000000000057171452252572700263740ustar00rootroot00000000000000package sortedset import ( "fmt" "math/rand" "testing" ) var unionSpecializationSizes = []int{16} var overlapChances = []float64{0, 0.1, 0.5, 1.0} func TestUnion(t *testing.T) { for _, test := range []struct { name string a []byte b []byte size int expect []byte }{ { name: "empty", size: 1, }, { name: "size 1, empty a", a: nil, b: []byte{1, 2, 3, 4, 5}, size: 1, expect: []byte{1, 2, 3, 4, 5}, }, { name: "size 1, empty b", a: []byte{1, 2, 3, 4, 5}, b: nil, size: 1, expect: []byte{1, 2, 3, 4, 5}, }, { name: "size 1, a == b", a: []byte{1, 2, 3, 4, 5}, b: []byte{1, 2, 3, 4, 5}, size: 1, expect: []byte{1, 2, 3, 4, 5}, }, { name: "size 1, a < b", a: []byte{1, 2, 3}, b: []byte{4, 5, 6}, size: 1, expect: []byte{1, 2, 3, 4, 5, 6}, }, { name: "size 1, b < a", a: []byte{4, 5, 6}, b: []byte{1, 2, 3}, size: 1, expect: []byte{1, 2, 3, 4, 5, 6}, }, { name: "size 1, a <= b", a: []byte{1, 2, 3}, b: []byte{3, 4, 5}, size: 1, expect: []byte{1, 2, 3, 4, 5}, }, { name: "size 1, b <= a", a: []byte{3, 4, 5}, b: []byte{1, 2, 3}, size: 1, expect: []byte{1, 2, 3, 4, 5}, }, { name: "size 1, interleaved 1", a: []byte{1, 3, 5}, b: []byte{2, 4, 6}, size: 1, expect: []byte{1, 2, 3, 4, 5, 6}, }, { name: "size 1, interleaved 2", a: []byte{2, 4, 6}, b: []byte{1, 3, 5}, size: 1, expect: []byte{1, 2, 3, 4, 5, 6}, }, } { t.Run(test.name, func(t *testing.T) { buf := make([]byte, len(test.a)+len(test.b)) actual := Union(buf, test.a, test.b, test.size) assertArraysEqual(t, test.expect, actual, test.size) }) } // Test the specializations. 
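// Same approach as the Dedupe specializations: for every size and overlap // probability, Union of two random sorted sets must match Dedupe applied to // the sorted concatenation of the inputs.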
for _, size := range unionSpecializationSizes { t.Run(fmt.Sprintf("size %d, random", size), func(t *testing.T) { const maxCount = 100 const iterations = 1000 prng := rand.New(rand.NewSource(0)) buf := make([]byte, size*maxCount*2) for i := 0; i < iterations; i++ { count := prng.Intn(maxCount) for _, p := range overlapChances { setA, setB := randomSortedSetPair(prng, size, count, p) actual := Union(buf[:0], setA, setB, size) expected := Dedupe(nil, combineArrays(setA, setB, size), size) assertArraysEqual(t, expected, actual, size) } } }) } } func BenchmarkUnion(b *testing.B) { for _, size := range unionSpecializationSizes { for _, p := range overlapChances { b.Run(fmt.Sprintf("size %d, with %d%% chance of overlap", size, int(p*100)), func(b *testing.B) { const bytes = 64 * 1024 prng := rand.New(rand.NewSource(0)) setA, setB := randomSortedSetPair(prng, size, bytes/size, p) buf := make([]byte, bytes*2) b.SetBytes(int64(bytes * 2)) b.ResetTimer() for i := 0; i < b.N; i++ { Union(buf[:0], setA, setB, size) } }) } } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/000077500000000000000000000000001452252572700225165ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/cmd/000077500000000000000000000000001452252572700232615ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/cmd/valid/000077500000000000000000000000001452252572700243605ustar00rootroot00000000000000golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/cmd/valid/README.md000066400000000000000000000031531452252572700256410ustar00rootroot00000000000000# valid This program is a helper to check the output of `utf8.Valid` and facilitate debugging. It accepts some input, runs both this library and stdlib's version of `utf8.Valid`, and prints out the result. ## Usage Provide the input as the first argument to the program: ``` $ go run main.go "hello! 😊" hello! 😊 [104 101 108 108 111 33 32 240 159 152 138] 11 bytes stdlib: utf8: true ascii: false valid: utf8: true ascii: false v: 1 ``` The input is parsed as a double-quoted Go string, so you can use escape codes: ``` $ go run main.go "\xFA" [250] 1 bytes stdlib: utf8: false ascii: false valid: utf8: false ascii: false v: 0 ``` Alternatively it can also consume input from stdin: ``` $ cat example.txt hello! 😊 $ go run main.go < example.txt hello! 😊 [104 101 108 108 111 33 32 240 159 152 138 10] 12 bytes stdlib: utf8: true ascii: false valid: utf8: true ascii: false v: 1 ``` As a bonus, if the file is the result of a failure reported by Go 1.18's fuzzer, the program extracts the actual value of the test: ``` $ cat fuzz.out go test fuzz []byte("000000000000000000~\xFF") $ go run main.go < fuzz.out Got fuzzer input 000000000000000000~ [48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 126 255] 20 bytes stdlib: utf8: false ascii: false valid: utf8: false ascii: false v: 0 ``` ## GDB A useful way to debug is to run this program with some problematic input and use GDB to step through the execution and inspect registers. The `debug.gdb` file is a basic helper to automate part of the process.
For example: ``` $ go build main.go && gdb --command=debug.gdb -ex "set args < ./example.txt" ./main ``` golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/cmd/valid/debug.gdb000066400000000000000000000001361452252572700261240ustar00rootroot00000000000000tui enable tui reg all b github.com/segmentio/asm/utf8.validateAvx commands 1 b +4 c end r golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/cmd/valid/main.go000066400000000000000000000022571452252572700256410ustar00rootroot00000000000000package main import ( "fmt" "io/ioutil" "os" "regexp" "strconv" "strings" stdlib "unicode/utf8" "github.com/segmentio/asm/ascii" "github.com/segmentio/asm/utf8" ) func main() { var data []byte if len(os.Args) > 1 { s := os.Args[1] s, err := strconv.Unquote(`"` + s + `"`) if err != nil { panic(err) } data = []byte(s) } else { var err error data, err = ioutil.ReadAll(os.Stdin) if err != nil { panic(err) } } s := string(data) lines := strings.Split(s, "\n") if len(lines) > 0 && strings.HasPrefix(lines[0], "go test fuzz") { fmt.Println("Got fuzzer input") // TODO: parse with go/parse instead of regexp? r := regexp.MustCompile(`^\[\]byte\((.+)\)`) results := r.FindStringSubmatch(lines[1]) s, err := strconv.Unquote(results[1]) if err != nil { panic(err) } data = []byte(s) } fmt.Println(string(data)) fmt.Println(data) fmt.Println(len(data), "bytes") uref := stdlib.Valid(data) aref := ascii.Valid(data) fmt.Println("stdlib: utf8:", uref, "ascii:", aref) v := utf8.Validate(data) fmt.Println("valid: utf8:", v.IsUTF8(), "ascii:", v.IsASCII(), "v:", v) if uref != v.IsUTF8() || aref != v.IsASCII() { os.Exit(1) } } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/utf8.go000066400000000000000000000002751452252572700237370ustar00rootroot00000000000000package utf8 import _ "github.com/segmentio/asm/cpu" // Valid reports whether p consists entirely of valid UTF-8-encoded runes. func Valid(p []byte) bool { return Validate(p).IsUTF8() } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/valid.go000066400000000000000000000007501452252572700241460ustar00rootroot00000000000000package utf8 import ( "unicode/utf8" "github.com/segmentio/asm/ascii" ) type Validation byte const ( Invalid = 0 UTF8 = 0b01 ASCII = 0b10 | UTF8 ) func (v Validation) IsASCII() bool { return (v & ASCII) == ASCII } func (v Validation) IsUTF8() bool { return (v & UTF8) == UTF8 } func (v Validation) IsInvalid() bool { return v == Invalid } func validate(p []byte) Validation { if ascii.Valid(p) { return ASCII } if utf8.Valid(p) { return UTF8 } return Invalid } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/valid_amd64.go000066400000000000000000000004041452252572700251350ustar00rootroot00000000000000// Code generated by command: go run valid_asm.go -pkg utf8 -out ../utf8/valid_amd64.s -stubs ../utf8/valid_amd64.go. DO NOT EDIT. //go:build !purego package utf8 // Optimized version of Validate for inputs of more than 32B. func validateAvx(p []byte) byte golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/valid_amd64.s000066400000000000000000000157511452252572700250050ustar00rootroot00000000000000// Code generated by command: go run valid_asm.go -pkg utf8 -out ../utf8/valid_amd64.s -stubs ../utf8/valid_amd64.go. DO NOT EDIT. 
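// In short: the routine below processes the input in 32-byte blocks. A // VPMOVMSKB check short-circuits blocks that are pure ASCII; other blocks // are classified through three nibble-indexed VPSHUFB lookup tables plus // continuation-byte checks, with any violation accumulated in a sticky error // vector. The returned byte encodes valid UTF-8 in bit 0 and valid ASCII in // bit 1.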
//go:build !purego #include "textflag.h" // func validateAvx(p []byte) byte // Requires: AVX, AVX2 TEXT ·validateAvx(SB), NOSPLIT, $0-25 MOVQ p_base+0(FP), AX MOVQ p_len+8(FP), CX MOVB $0x01, DL // Prepare the constant masks VMOVDQU incomplete_mask<>+0(SB), Y0 VMOVDQU cont4_vec<>+0(SB), Y1 VMOVDQU cont3_vec<>+0(SB), Y2 // High nibble of current byte VMOVDQU nibble1_errors<>+0(SB), Y3 // Low nibble of current byte VMOVDQU nibble2_errors<>+0(SB), Y4 // High nibble of the next byte VMOVDQU nibble3_errors<>+0(SB), Y5 // Nibble mask VMOVDQU nibble_mask<>+0(SB), Y6 // MSB mask VMOVDQU msb_mask<>+0(SB), Y7 // For the first pass, set the previous block as zero. VXORPS Y8, Y8, Y8 // Zeroes the error vector. VXORPS Y9, Y9, Y9 // Zeroes the "previous block was incomplete" vector. VXORPS Y10, Y10, Y10 // Top of the loop. check_input: // if bytes left >= 32 CMPQ CX, $0x20 // go process the next block JGE process // If < 32 bytes left // Fast exit if done CMPQ CX, $0x00 JE end // If 0 < bytes left < 32 VPXOR Y12, Y12, Y12 MOVQ $0x0000000000000020, BX SUBQ CX, BX SUBQ BX, AX VMOVDQU (AX), Y11 CMPQ CX, $0x10 JA tail_load_large // Shift right that works if remaining bytes <= 16, safe next to a page boundary VPERM2I128 $0x03, Y11, Y12, Y11 LEAQ shuffle_clear_mask<>+16(SB), SI ADDQ CX, BX ADDQ CX, BX SUBQ $0x20, BX SUBQ BX, SI VMOVDQU (SI), Y13 VPSHUFB Y13, Y11, Y11 XORQ CX, CX JMP loaded // Shift right that works if remaining bytes >= 16, safe next to a page boundary tail_load_large: ADDQ CX, BX ADDQ CX, BX SUBQ $0x30, BX LEAQ shuffle_mask<>+16(SB), SI SUBQ BX, SI VMOVDQU (SI), Y13 VPSHUFB Y13, Y11, Y14 VPERM2I128 $0x03, Y11, Y12, Y11 VPSHUFB Y13, Y11, Y11 LEAQ blend_mask<>+16(SB), CX SUBQ BX, CX VBROADCASTF128 (CX), Y12 VPBLENDVB Y12, Y14, Y11, Y11 XORQ CX, CX JMP loaded // Process one 32B block of data process: // Load the next block of bytes VMOVDQU (AX), Y11 SUBQ $0x20, CX ADDQ $0x20, AX loaded: // Fast check to see if ASCII VPMOVMSKB Y11, BX CMPL BX, $0x00 JNZ non_ascii // If this whole block is ASCII, there is nothing to do, and it is an error if any of the previous code point was incomplete. VPOR Y9, Y10, Y9 JMP check_input non_ascii: XORB DL, DL // Prepare intermediate vector for push operations VPERM2I128 $0x03, Y8, Y11, Y8 // Check errors on the high nibble of the previous byte VPALIGNR $0x0f, Y8, Y11, Y10 VPSRLW $0x04, Y10, Y12 VPAND Y12, Y6, Y12 VPSHUFB Y12, Y3, Y12 // Check errors on the low nibble of the previous byte VPAND Y10, Y6, Y10 VPSHUFB Y10, Y4, Y10 VPAND Y10, Y12, Y12 // Check errors on the high nibble on the current byte VPSRLW $0x04, Y11, Y10 VPAND Y10, Y6, Y10 VPSHUFB Y10, Y5, Y10 VPAND Y10, Y12, Y12 // Find 3 bytes continuations VPALIGNR $0x0e, Y8, Y11, Y10 VPSUBUSB Y2, Y10, Y10 // Find 4 bytes continuations VPALIGNR $0x0d, Y8, Y11, Y8 VPSUBUSB Y1, Y8, Y8 // Combine them to have all continuations VPOR Y10, Y8, Y8 // Perform a byte-sized signed comparison with zero to turn any non-zero bytes into 0xFF. VXORPS Y10, Y10, Y10 VPCMPGTB Y10, Y8, Y8 // Find bytes that are continuations by looking at their most significant bit. VPAND Y7, Y8, Y8 // Find mismatches between expected and actual continuation bytes VPXOR Y8, Y12, Y8 // Store result in sticky error VPOR Y9, Y8, Y9 // Prepare for next iteration VPSUBUSB Y0, Y11, Y10 VMOVDQU Y11, Y8 // End of loop JMP check_input end: // If the previous block was incomplete, this is an error. 
VPOR Y10, Y9, Y9 // Return whether any error bit was set VPTEST Y9, Y9 SETEQ AL // Bit 0 tells if the input is valid utf8, bit 1 tells if it's valid ascii ANDB AL, DL SHLB $0x01, DL ORB DL, AL MOVB AL, ret+24(FP) VZEROUPPER RET DATA incomplete_mask<>+0(SB)/8, $0xffffffffffffffff DATA incomplete_mask<>+8(SB)/8, $0xffffffffffffffff DATA incomplete_mask<>+16(SB)/8, $0xffffffffffffffff DATA incomplete_mask<>+24(SB)/8, $0xbfdfefffffffffff GLOBL incomplete_mask<>(SB), RODATA|NOPTR, $32 DATA cont4_vec<>+0(SB)/8, $0xefefefefefefefef DATA cont4_vec<>+8(SB)/8, $0xefefefefefefefef DATA cont4_vec<>+16(SB)/8, $0xefefefefefefefef DATA cont4_vec<>+24(SB)/8, $0xefefefefefefefef GLOBL cont4_vec<>(SB), RODATA|NOPTR, $32 DATA cont3_vec<>+0(SB)/8, $0xdfdfdfdfdfdfdfdf DATA cont3_vec<>+8(SB)/8, $0xdfdfdfdfdfdfdfdf DATA cont3_vec<>+16(SB)/8, $0xdfdfdfdfdfdfdfdf DATA cont3_vec<>+24(SB)/8, $0xdfdfdfdfdfdfdfdf GLOBL cont3_vec<>(SB), RODATA|NOPTR, $32 DATA nibble1_errors<>+0(SB)/8, $0x0202020202020202 DATA nibble1_errors<>+8(SB)/8, $0x4915012180808080 DATA nibble1_errors<>+16(SB)/8, $0x0202020202020202 DATA nibble1_errors<>+24(SB)/8, $0x4915012180808080 GLOBL nibble1_errors<>(SB), RODATA|NOPTR, $32 DATA nibble2_errors<>+0(SB)/8, $0xcbcbcb8b8383a3e7 DATA nibble2_errors<>+8(SB)/8, $0xcbcbdbcbcbcbcbcb DATA nibble2_errors<>+16(SB)/8, $0xcbcbcb8b8383a3e7 DATA nibble2_errors<>+24(SB)/8, $0xcbcbdbcbcbcbcbcb GLOBL nibble2_errors<>(SB), RODATA|NOPTR, $32 DATA nibble3_errors<>+0(SB)/8, $0x0101010101010101 DATA nibble3_errors<>+8(SB)/8, $0x01010101babaaee6 DATA nibble3_errors<>+16(SB)/8, $0x0101010101010101 DATA nibble3_errors<>+24(SB)/8, $0x01010101babaaee6 GLOBL nibble3_errors<>(SB), RODATA|NOPTR, $32 DATA nibble_mask<>+0(SB)/8, $0x0f0f0f0f0f0f0f0f DATA nibble_mask<>+8(SB)/8, $0x0f0f0f0f0f0f0f0f DATA nibble_mask<>+16(SB)/8, $0x0f0f0f0f0f0f0f0f DATA nibble_mask<>+24(SB)/8, $0x0f0f0f0f0f0f0f0f GLOBL nibble_mask<>(SB), RODATA|NOPTR, $32 DATA msb_mask<>+0(SB)/8, $0x8080808080808080 DATA msb_mask<>+8(SB)/8, $0x8080808080808080 DATA msb_mask<>+16(SB)/8, $0x8080808080808080 DATA msb_mask<>+24(SB)/8, $0x8080808080808080 GLOBL msb_mask<>(SB), RODATA|NOPTR, $32 DATA shuffle_mask<>+0(SB)/8, $0x0706050403020100 DATA shuffle_mask<>+8(SB)/8, $0x0f0e0d0c0b0a0908 DATA shuffle_mask<>+16(SB)/8, $0x0706050403020100 DATA shuffle_mask<>+24(SB)/8, $0x0f0e0d0c0b0a0908 DATA shuffle_mask<>+32(SB)/8, $0x0706050403020100 DATA shuffle_mask<>+40(SB)/8, $0x0f0e0d0c0b0a0908 GLOBL shuffle_mask<>(SB), RODATA|NOPTR, $48 DATA shuffle_clear_mask<>+0(SB)/8, $0x0706050403020100 DATA shuffle_clear_mask<>+8(SB)/8, $0x0f0e0d0c0b0a0908 DATA shuffle_clear_mask<>+16(SB)/8, $0xffffffffffffffff DATA shuffle_clear_mask<>+24(SB)/8, $0xffffffffffffffff DATA shuffle_clear_mask<>+32(SB)/8, $0xffffffffffffffff DATA shuffle_clear_mask<>+40(SB)/8, $0xffffffffffffffff GLOBL shuffle_clear_mask<>(SB), RODATA|NOPTR, $48 DATA blend_mask<>+0(SB)/8, $0xffffffffffffffff DATA blend_mask<>+8(SB)/8, $0xffffffffffffffff DATA blend_mask<>+16(SB)/8, $0x0000000000000000 DATA blend_mask<>+24(SB)/8, $0x0000000000000000 DATA blend_mask<>+32(SB)/8, $0xffffffffffffffff DATA blend_mask<>+40(SB)/8, $0xffffffffffffffff GLOBL blend_mask<>(SB), RODATA|NOPTR, $48 golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/valid_default.go000066400000000000000000000003471452252572700256540ustar00rootroot00000000000000//go:build purego || !amd64 // +build purego !amd64 package utf8 // Validate is a more precise version of Valid that also indicates whether the // input was valid ASCII. 
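// This fallback is used on purego or non-amd64 builds; validate (in valid.go) // tries ascii.Valid first and only falls back to the standard library's // unicode/utf8.Valid when the input is not pure ASCII.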
func Validate(p []byte) Validation { return validate(p) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/valid_go18_test.go000066400000000000000000000007421452252572700260440ustar00rootroot00000000000000//go:build go1.18 // +build go1.18 package utf8 import ( "testing" stdlib "unicode/utf8" "github.com/segmentio/asm/ascii" ) func FuzzValid(f *testing.F) { f.Fuzz(func(t *testing.T, data []byte) { v := Validate(data) ru := stdlib.Valid(data) if ru != v.IsUTF8() { t.Errorf("Validate(%q) UTF8 = %v; want %v", data, v.IsUTF8(), ru) } ra := ascii.Valid(data) if ra != v.IsASCII() { t.Errorf("Validate(%q) ASCII = %v; want %v", data, v.IsASCII(), ra) } }) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/valid_support_amd64.go000066400000000000000000000006301452252572700267320ustar00rootroot00000000000000//go:build !purego // +build !purego package utf8 import ( "github.com/segmentio/asm/cpu" "github.com/segmentio/asm/cpu/x86" ) var noAVX2 = !cpu.X86.Has(x86.AVX2) // Validate is a more precise version of Valid that also indicates whether the // input was valid ASCII. func Validate(p []byte) Validation { if noAVX2 || len(p) < 32 { return validate(p) } r := validateAvx(p) return Validation(r) } golang-github-segmentio-asm-1.2.0+git20231107.1cfacc8/utf8/valid_test.go000066400000000000000000000152331452252572700252070ustar00rootroot00000000000000package utf8 import ( "bytes" "fmt" "io/ioutil" "strings" "testing" "unicode/utf8" "github.com/segmentio/asm/ascii" "github.com/segmentio/asm/internal/buffer" ) type byteRange struct { Low byte High byte } func one(b byte) byteRange { return byteRange{b, b} } func genExamples(current string, ranges []byteRange) []string { if len(ranges) == 0 { return []string{string(current)} } r := ranges[0] var all []string elements := []byte{r.Low, r.High} mid := (r.High + r.Low) / 2 if mid != r.Low && mid != r.High { elements = append(elements, mid) } for _, x := range elements { s := current + string(x) all = append(all, genExamples(s, ranges[1:])...) 
if x == r.High { break } } return all } func TestValid(t *testing.T) { var examples = []string{ // Tests copied from the stdlib "", "a", "abc", "Ж", "ЖЖ", "брэд-ЛГТМ", "☺☻☹", // overlong "\xE0\x80", // unfinished continuation "aa\xE2", string([]byte{66, 250}), string([]byte{66, 250, 67}), "a\uFFFDb", "\xF4\x8F\xBF\xBF", // U+10FFFF "\xF4\x90\x80\x80", // U+10FFFF+1; out of range "\xF7\xBF\xBF\xBF", // 0x1FFFFF; out of range "\xFB\xBF\xBF\xBF\xBF", // 0x3FFFFFF; out of range "\xc0\x80", // U+0000 encoded in two bytes: incorrect "\xed\xa0\x80", // U+D800 high surrogate (sic) "\xed\xbf\xbf", // U+DFFF low surrogate (sic) // valid at boundary strings.Repeat("a", 32+28) + "☺☻☹", strings.Repeat("a", 32+29) + "☺☻☹", strings.Repeat("a", 32+30) + "☺☻☹", strings.Repeat("a", 32+31) + "☺☻☹", // invalid at boundary strings.Repeat("a", 32+31) + "\xE2a", // same inputs as benchmarks "0123456789", "日本語日本語日本語日", "\xF4\x8F\xBF\xBF", // bugs found with fuzzing "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc60", "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc300", "߀0000000000000000000000000000訨", "0000000000000000000000000000000˂00000000000000000000000000000000", } any := byteRange{0, 0xFF} ascii := byteRange{0, 0x7F} cont := byteRange{0x80, 0xBF} rangesToTest := [][]byteRange{ {one(0x20), ascii, ascii, ascii}, // 2-byte sequences {one(0xC2)}, {one(0xC2), ascii}, {one(0xC2), cont}, {one(0xC2), {0xC0, 0xFF}}, {one(0xC2), cont, cont}, {one(0xC2), cont, cont, cont}, // 3-byte sequences {one(0xE1)}, {one(0xE1), cont}, {one(0xE1), cont, cont}, {one(0xE1), cont, cont, ascii}, {one(0xE1), cont, ascii}, {one(0xE1), cont, cont, cont}, // 4-byte sequences {one(0xF1)}, {one(0xF1), cont}, {one(0xF1), cont, cont}, {one(0xF1), cont, cont, cont}, {one(0xF1), cont, cont, ascii}, {one(0xF1), cont, cont, cont, ascii}, // overlong {{0xC0, 0xC1}, any}, {{0xC0, 0xC1}, any, any}, {{0xC0, 0xC1}, any, any, any}, {one(0xE0), {0x0, 0x9F}, cont}, {one(0xE0), {0xA0, 0xBF}, cont}, } for _, r := range rangesToTest { examples = append(examples, genExamples("", r)...) } for _, i := range []int{300, 316} { d := bytes.Repeat(someutf8, i/len(someutf8)) examples = append(examples, string(d)) } for _, tt := range examples { t.Run(tt, func(t *testing.T) { check(t, []byte(tt)) }) // Generate variations of the input to exercise errors at the // boundary, using the vector implementation on 32-sized input, // and on non-32-sized inputs. // // Large examples don't go through those variations because they // are likely specific tests. 
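// Three variants are generated for each example: "boundary-" pads the input // to exactly 32 bytes, "vec-padded-" pads it to a multiple of 32, and "vec-" // forces a length that is not a multiple of 32.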
if len(tt) >= 32 { continue } t.Run("boundary-"+tt, func(t *testing.T) { size := 32 - len(tt) prefix := strings.Repeat("a", size) b := []byte(prefix + tt) check(t, b) }) t.Run("vec-padded-"+tt, func(t *testing.T) { prefix := strings.Repeat("a", 32) padding := strings.Repeat("b", 32-(len(tt)%32)) input := prefix + padding + tt b := []byte(input) if len(b)%32 != 0 { panic("test should generate block of 32") } check(t, b) }) t.Run("vec-"+tt, func(t *testing.T) { prefix := strings.Repeat("a", 32) input := prefix + tt if len(tt)%32 == 0 { input += "x" } b := []byte(input) if len(b)%32 == 0 { panic("test should not generate block of 32") } check(t, b) }) } } func TestValidPageBoundary(t *testing.T) { buf, err := buffer.New(64) if err != nil { t.Fatal(err) } defer buf.Release() head := buf.ProtectHead() tail := buf.ProtectTail() data := bytes.Repeat(someutf8, 64/len(someutf8)) copy(head, data) copy(tail, data) for i := 0; i <= 32; i++ { input := head[:i] check(t, input) } for i := 0; i <= 32; i++ { input := tail[i:] check(t, input) } } func check(t *testing.T, b []byte) { t.Helper() // Check that both Valid and Validate behave properly. Should not be // necessary given the definition of Valid, but just in case. expected := utf8.Valid(b) if Valid(b) != expected { err := ioutil.WriteFile("test.out.txt", b, 0600) if err != nil { panic(err) } t.Errorf("Valid(%q) = %v; want %v", string(b), !expected, expected) } v := Validate(b) if v.IsUTF8() != expected { t.Errorf("Validate(%q) utf8 valid: %v; want %v", string(b), !expected, expected) } expected = ascii.Valid(b) if v.IsASCII() != expected { t.Errorf("Validate(%q) ascii valid: %v; want %v", string(b), !expected, expected) } } var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16) var valid1M = bytes.Repeat(valid1k, 1024) var someutf8 = []byte("\xF4\x8F\xBF\xBF") func BenchmarkValid(b *testing.B) { impls := map[string]func([]byte) bool{ "AVX": Valid, "Stdlib": utf8.Valid, } type input struct { name string data []byte } inputs := []input{ {"1kValid", valid1k}, {"1MValid", valid1M}, {"10ASCII", []byte("0123456789")}, {"10Japan", []byte("日本語日本語日本語日")}, } const KiB = 1024 const MiB = 1048576 for i := 0; i <= 400/len(someutf8); i++ { // for _, i := range []int{1 * KiB, 8 * KiB, 16 * KiB, 64 * KiB, 1 * MiB, 8 * MiB, 32 * MiB, 64 * MiB} { d := bytes.Repeat(someutf8, i) inputs = append(inputs, input{ name: fmt.Sprintf("small%d", len(d)), data: d, }) } for _, i := range []int{300, 316} { d := bytes.Repeat(someutf8, i/len(someutf8)) inputs = append(inputs, input{ name: fmt.Sprintf("tail%d", len(d)), data: d, }) } for _, input := range inputs { for implName, f := range impls { testName := fmt.Sprintf("%s/%s", input.name, implName) b.Run(testName, func(b *testing.B) { b.SetBytes(int64(len(input.data))) b.ResetTimer() for i := 0; i < b.N; i++ { f(input.data) } }) } } }