pax_global_header00006660000000000000000000000064151752230200014507gustar00rootroot0000000000000052 comment=b4a468455e40958797bc2bc8430bce8aa9013b09 golang-github-kelindar-simd-1.2.0/000077500000000000000000000000001517522302000167575ustar00rootroot00000000000000golang-github-kelindar-simd-1.2.0/.github/000077500000000000000000000000001517522302000203175ustar00rootroot00000000000000golang-github-kelindar-simd-1.2.0/.github/FUNDING.yml000066400000000000000000000000241517522302000221300ustar00rootroot00000000000000github: [kelindar] golang-github-kelindar-simd-1.2.0/.github/logo.png000066400000000000000000000351061517522302000217720ustar00rootroot00000000000000PNG  IHDRX4 pHYs.#.#x?v OiCCPPhotoshop ICC profilexڝSgTS=BKKoR RB&*! J!QEEȠQ, !{kּ> H3Q5 B.@ $pd!s#~<<+"x M0B\t8K@zB@F&S`cbP-`'{[! eDh;VEX0fK9-0IWfH  0Q){`##xFW<+*x<$9E[-qWW.(I+6aa@.y24x6_-"bbϫp@t~,/;m%h^ uf@Wp~<5j>{-]cK'Xto(hw?G%fIq^D$.Tʳ?D*A, `6B$BB dr`)B(Ͱ*`/@4Qhp.U=pa( Aa!ڈbX#!H$ ɈQ"K5H1RT UH=r9\F;2G1Q= C7F dt1r=6Ыhڏ>C03l0.B8, c˱" VcϱwE 6wB aAHXLXNH $4 7 Q'"K&b21XH,#/{C7$C2'ITFnR#,4H#dk9, +ȅ3![ b@qS(RjJ4e2AURݨT5ZBRQ4u9̓IKhhitݕNWGw Ljg(gwLӋT071oUX**| J&*/Tު UUT^S}FU3S ԖUPSSg;goT?~YYLOCQ_ cx,!k u5&|v*=9C3J3WRf?qtN (~))4L1e\kXHQG6EYAJ'\'GgSSݧ M=:.kDwn^Loy}/TmG X $ <5qo</QC]@Caaᄑ.ȽJtq]zۯ6iܟ4)Y3sCQ? 
0k߬~OCOg#/c/Wװwa>>r><72Y_7ȷOo_C#dz%gA[z|!?:eAAA!h쐭!ΑiP~aa~ 'W?pX15wCsDDDޛg1O9-J5*>.j<74?.fYXXIlK9.*6nl {/]py.,:@LN8A*%w% yg"/6шC\*NH*Mz쑼5y$3,幄'L Lݛ:v m2=:1qB!Mggfvˬen/kY- BTZ(*geWf͉9+̳ې7ᒶKW-X潬j9(xoʿܔĹdff-[n ڴ VE/(ۻCɾUUMfeI?m]Nmq#׹=TR+Gw- 6 U#pDy  :v{vg/jBFS[b[O>zG499?rCd&ˮ/~јѡ򗓿m|x31^VwwO| (hSЧc3- cHRMz%u0`:o_F/qIDATx{|?nBH@ -U"-h*jkOZAA+UER+mTbUmA[ $!dg?f6l6޲|޾@9s3眙Ua1`1`1`1`1`1`1`1`1`Q}Li&@d$ MJOL&@?ByG(8UM3X1B [z_sޛw䃬$""*ɀuO^ r=`f*11x=eG5ڜ" UPw\w{tXCDDT,R"4r0~ Y"$?+qqg805DDDEӧ4u൯ 1˽peAF8rET7,u#3޻| ""*uŘfX^ؑ+,ꕐ#YF6q\NDD4LL$ui@`b %b}CDDE f7JeŅ++ORpcMcQ1{>筵F5WO D1cbQ1aKj!Eقgv'f낈AiP XYDDT R!}'QqB (S桬1ÔGDD (c.r׸DDDD3`1`1`1`Q,BA%`1`1`1`QRUNIp_B0\C}폈GU@Oh';U6C*IԽ_*m0h#ŘYo޴'~l?>(m@c cƊ@ `^6vK a YVȏm4< Q "L8?5Cd, A3 HxKo PJoFWי5܉}bED=$:c!;? `\8?K݅=b+bƊ @`c^w.W^ÅFջڨ*DhOǍUŪ1lXxgvra!bȬ 8o{XDVǚs8~!$IU frJ6h/{9JXcCYދc=GPq B p@s: K i]N:`_w%PFJ+<4I/KQ˜_p5URol w X]q#z| }(l{Twkw2qCDDW=\ }k6{ .I)媬l^d/eH$"R XiV˜Z| 9@s{7/rwݥX?Ի?9\NTP8EWIJ#X/l^|uږfU6oYL7.5`%qAiG2(y(]95ZeUG*>0eQJ?\N2`i?e'' ,9$gr$$pU 1` w!M-.xk(A(U5u!k^P'f!"J)3&JW1+?G[#"$"8VӇ>7kQG=k'$qʏ兟^7tӵA-G:-nu 5`?Hu/{/g5XoV_+Mvhl|/Gpx_-8aٿq;>.,uyO+*d$jD0 (hhqa[H`0>ppZ[] *q`^Kg߷e<`"zëlۚE{ܜB]$/񌜻(1QW ~4@M[޺rӵޏ-9Xn%VMN?Lu՚#A`TתꆘzϪΣ43F]f,{>iU %pŵ6.a{VoܶsQDFy cA(N]9*=^3]'/-NosP剤G^6_w>Vxw - :.TuPYe屇:#]' 2rG˜hL7Z7m^p˾\ݨN?,Z|09o\$Iݫ sg7q:u 2rIz*_#Mߙn 2bscE v=6a}ƶDuVj-7L~22bޚ˭$P+fÛp0`kECXR{scE 1]IPP\﫪[up{m[||hyw[L^X1c;{ۺ`"{zoY˯:R,{!L12Lusuo16mY0Aj/_uXB@v6].F.?׺n q}۔h069oۦ @j箙y>Q,Pkb y g3`m.%;xDvt>N(:'(CsbBwun2?hmLAk˂ 2ba @ `2Yӛ;#zi7Lzc/16`+k_ȍ*vzw#PvYK~e-Ĵ*߁:pjo- &Dtg1D3]Zvmop>y_2l XTxaN;9.ٚg Z,+ @Vl>&5 y?78A!>ٲ`lY0An:c`; vH]CD;^/Aa+\+pU;:}\!HЫs[_6kmo=8{1\dqOWz˰YO}pkCblok Yc羹/`_'uxbjIJI?h,1P'1`QR_>>߿G,yV!r ,O` LwjJ&\ b*|x2؀uF__E;sxU*Jv7 Q(X#aP+16}׹`My膛wxoò˷2u+7X2uۮ ïX}<.3dzo{14F¢dzlޖ#W} wMFϗͶgUonw> WîXs9?ڙJ4 5>DMDC|gM!!ɟs5Mr5:jbPmՌ1@j W:/*;/} 2tΪT̽0ށCtj׷^3CTTs+ֲ"J b1|$PNjDiJ)e؜^ۯ?V,}"bِ`˦z f@;-^ye1~˾WU%,YӅYlVq!NtCg=}DXv}OW:ۿ|_?Az~FnKJß.>y} /T+f`ŧRI ULv&m۱Pn{D}+#6eNN)_ֈbXeu 9}/F)?OD׬^]&*(7!x-^ǯvYup,paGB zv%rb Vۆh ~O^dݱp7Ԙٰ;(9xEEreaU ڜVߍ 
āLX*p@;?Xi 'bk:qȓ`gqU￷~;0bA@(N? +:"r5;TԯIEEB $8` eg,?%GfJ2`afӱuH%nVtaZ/#DE.ۈN׹0XE?Zаg|A 9f ftv^c7L!'>R&kbv` m̙bYPc _93U u1[#IS[iw ;4] a7ǓmC{Z^ma `u[^ݗxnjoYߣ @v-8e_顊NXB~͛wZ݆)BB1&P6 l^ZO7U{`lE*wP'PXuDN}:u dF87SĀϝp`M5Y`_},׿=/Ì˿OwtILĄh]jZlG냗]]3gLi1ivp N6,quE7]װtʳ:Z(1㗈eO*v#oQdeSqݰ܌^ӰI{߶rUbOco@>` N=NŅ+.T^W^KS|zS XNܯ< ~4n=bpͥd]g]KcHaצ 8cF/ޮ:Uc",vi6kA߮f@\G!"< ϑBW.^Gnid dХϜ~FCW$9G;F˪noYKV9m[6|nb/a5D՜߸/\|c81so_AƪPvÕ&돺ٻ皜սrpzω_yva!$>71h3_ꬅ }Ӥ'cAbD7ѫQ,k$ߜ!.vA ?8kG'̔We_~۾p-{\ZXs25bAu ׏_RsۓMĀE9w#q}-|?tXUZ#А-`  U:ojŞp~⳵!W}Էc?s4ԃNd<tǔY@7յՍu޼ S{ *'jI("M;N0 (`F(Ӵ) UIіSB DzmͿkӏwmww^b Kꑓsp9IHEruo; =#u aR݀V|wCCXQ)stޢSaȲ_{ϛ{p#ۊ ]iBp6_'`yZթBĆ5d ^kV[7|†IZ3g{W m\2٘hgD58AW +5eQd.U?Qс$.Ta;;^%m(t)Oj(툚>8E ., x k{:b  M^[hZuX0pp;7vat^`,uoi{vhJq>H'onc¤6w=&FE>$VX\L@zH%~υcGn[RU7b"P1Pcyˆ7r 0Vh nU {w.!dϢgQ=lpً庎.U/WU^P"Z&~f]Mt]맩:P՜n?QХN<1W5다-7L72VEݲ@(r PV@jU3,1S0'+2n&jǔmn載iĮ*QT[תx#D+qr# [~Sz@SnkCuV.H9;M>!ߵqQޣHOc=e}Y 烴kUh~GZ}ӅWs>U#:h@U\Wyٚ_ƛR.ivtO9_#mԴf==șb dmqB@,MMnPyŷb D%`J˔GXSv#PWkgyKul0 X!h<5w[n(=go<鴹4sZ70+f"l V?q͊(Q,/d}~YG릨lAL"Dcyg %DCVӅ@)L^n_=Y*S0t9mx&ufVDN!+rӰi:ȪyYbU[o=CB⾦lboZ=HgUEyMv;1b5v~x{[3 =*x{{s. V9AG#֟N~ӡvp1^^(ˢ+ٹ +r m_X64æx%=wGObDLbf Y߱-׶Oyk| m3d8 ;t8$ Qб1EZ#^I{utfj*_J}εŒ&m_I~en.&s-`i\ WM=\'h&7.\ޏ^4oEeLuLHk< `J.ڗ}_DoĴIK%́ԎB\ ozCo7;Y\-8q(Nt%6$}NmU?nClz-."I^,|-x{[^K2`kcp .3/v%'y)Gvww~=TT_B._]w#X@lZhK}! /=F݌b-=F?Z=506 @䭾m*\g0&15ne TPֆ0vpcw_  b05O|~q.% Zq>70be%>ı/;8:qM2,Np֋u޲)O-=&I>u@p9ӻHY" X$<5^X"RV>L}e0?_ߕ%WRם ,lF{mG$W*L`(hf|7w V]˟[̔k9 1WD zUMT(rw(rD ~MT!nd7vjg6n1\^=F|>^Dh; WvUQ*+#^4.\1`YsImrsADi,-xؿ=:dע7ۣ{n)+5x-Z\.~?׵2?/ʍ~,?//V[\m<: |rK X~/F 靣%U6e><\ *3ԇWXaK`͇-hjsq8FP4xV?s[S?Y[9z=gXsިCܢX;#oNRGϫ/xkz #CTd j]mƼ]aӻ];poqW>/:F8΍.2){/U\sl7k-r>^d^wo_3?#T:!q V=&^窴BVtqM?)O:w%uN_ >(ޠ~kK+j7\Tj+Q%cqFu)?\gnGS*?ݭi5rVnk{˔'ug};Ϲ7~Y*_I|{oL=_=- 3])={/9Ao/?xK>K_6ò'8U7>KmLp+帧d+eiӺa_W8/?zs⒝uհB 6^y ᦇ:V\{_?L~`R턵N?,|@OGGZpvy.7!+{yϿ߾_|\#0v2= cF' ] hoY/<@;\ HuEuI Xlk Y!FcWxϢq_D$!kuvzLNlg}W"n?:1$c=. 
~uʫƄڴ&ih2 1`nķxoAHS01zꌒupSNf[kp7 D3܋ #L1?[:D$nV8iԁ& WTKcNuM u[,S΋^Ө, bABmVMjqvB.7>i\O!AD.: 꽀E]x/,JDD#`Qxg"""","""","""",""""b""""b""""b""""","""","""",""""b""""b""""b""""b""""+fCe;IENDB`golang-github-kelindar-simd-1.2.0/.github/workflows/000077500000000000000000000000001517522302000223545ustar00rootroot00000000000000golang-github-kelindar-simd-1.2.0/.github/workflows/test.yml000066400000000000000000000011651517522302000240610ustar00rootroot00000000000000name: Test on: [push] env: GITHUB_TOKEN: ${{ secrets.COVERALLS_TOKEN }} GO111MODULE: "on" jobs: test: name: Test with Coverage runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - uses: actions/setup-go@v6 with: go-version: "1.25" - name: Install dependencies run: | go mod download - name: Run Unit Tests run: | go test -race -covermode atomic -coverprofile=profile.cov . - name: Upload Coverage uses: shogo82148/actions-goveralls@v1 with: path-to-profile: profile.cov golang-github-kelindar-simd-1.2.0/.gitignore000066400000000000000000000000131517522302000207410ustar00rootroot00000000000000temp.s cmdgolang-github-kelindar-simd-1.2.0/LICENSE000066400000000000000000000021061517522302000177630ustar00rootroot00000000000000MIT License Copyright (c) 2026 Roman Atachiants Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. golang-github-kelindar-simd-1.2.0/README.md000066400000000000000000000553241517522302000202470ustar00rootroot00000000000000

kelindar/simd
Go Version PkgGoDev License Coverage

## Vectorized Math Functions This library contains a set of vectorized mathematical functions which were [auto-vectorized](https://llvm.org/docs/Vectorizers.html) using the clang compiler and translated into Plan 9 assembly code for Go. A generic version is also provided for CPUs where vectorization is not available, or for which this library doesn't have generated code. It currently supports `AVX2` on `amd64` and `NEON` (Advanced SIMD) on `arm64` (including Apple Silicon). Most of the code in this library is auto-generated, which helps with maintenance. ## Usage The API is intentionally simple and non-opinionated: - Reduction ops: `Sum*`, `Min*`, `Max*` - Element-wise ops: `Add*`, `Sub*`, `Mul*`, `Div*` - Typed fast paths for `int*`, `uint*`, `float*` slices - Generic fallback when SIMD is unavailable ### Examples Compute a sum: ```go sum := simd.SumFloat32s([]float32{1, 2, 3, 4, 5}) ``` Element-wise add into a destination buffer: ```go a := []float32{1, 2, 3, 4} b := []float32{10, 20, 30, 40} dst := make([]float32, len(a)) simd.AddFloat32s(dst, a, b) // dst => []float32{11, 22, 33, 44} ``` Generic API (works across numeric slice types): ```go values := []int16{7, 2, 9, 4} min := simd.Min(values) // 2 max := simd.Max(values) // 9 sum := simd.Sum(values) // 22 ``` ## Benchmarks ```go goos: windows goarch: amd64 pkg: github.com/kelindar/simd cpu: 13th Gen Intel(R) Core(TM) i7-13700K TYPE OP SIZE RATE SPEEDUP uint8 sum 256 3.43 ns/op 13.68x uint8 min 256 3.49 ns/op 19.12x uint8 max 256 3.48 ns/op 21.08x uint8 add 256 4.23 ns/op 19.37x uint8 sub 256 4.25 ns/op 19.44x uint8 mul 256 6.06 ns/op 13.52x uint8 div 256 289.94 ns/op 1.20x uint8 sum 4096 16.72 ns/op 46.02x uint8 min 4096 16.59 ns/op 55.02x uint8 max 4096 16.90 ns/op 54.28x uint8 add 4096 30.64 ns/op 38.13x uint8 sub 4096 29.15 ns/op 39.93x uint8 mul 4096 79.53 ns/op 14.69x uint8 div 4096 4646.76 ns/op 1.19x uint8 sum 16384 58.88 ns/op 52.57x uint8 min 16384 59.61 ns/op 61.48x uint8 max 16384 59.34 ns/op 61.39x 
uint8 add 16384 159.51 ns/op 29.68x uint8 sub 16384 147.56 ns/op 31.72x uint8 mul 16384 313.20 ns/op 14.88x uint8 div 16384 18551.34 ns/op 1.18x TYPE OP SIZE RATE SPEEDUP uint16 sum 256 4.58 ns/op 9.93x uint16 min 256 4.27 ns/op 32.99x uint16 max 256 4.45 ns/op 19.57x uint16 add 256 5.53 ns/op 17.34x uint16 sub 256 5.53 ns/op 18.43x uint16 mul 256 5.53 ns/op 18.62x uint16 div 256 342.14 ns/op 1.01x uint16 sum 4096 32.29 ns/op 23.92x uint16 min 4096 30.82 ns/op 75.31x uint16 max 4096 31.26 ns/op 49.09x uint16 add 4096 57.24 ns/op 25.48x uint16 sub 4096 57.14 ns/op 26.93x uint16 mul 4096 46.25 ns/op 34.19x uint16 div 4096 5439.07 ns/op 1.00x uint16 sum 16384 114.01 ns/op 27.21x uint16 min 16384 110.94 ns/op 84.12x uint16 max 16384 112.55 ns/op 54.88x uint16 add 16384 535.12 ns/op 10.39x uint16 sub 16384 970.32 ns/op 6.34x uint16 mul 16384 986.37 ns/op 12.71x uint16 div 16384 31041.77 ns/op 0.79x TYPE OP SIZE RATE SPEEDUP uint32 sum 256 10.71 ns/op 9.66x uint32 min 256 11.45 ns/op 16.55x uint32 max 256 11.10 ns/op 16.65x uint32 add 256 20.22 ns/op 7.82x uint32 sub 256 20.59 ns/op 7.67x uint32 mul 256 30.25 ns/op 5.88x uint32 div 256 370.89 ns/op 1.03x uint32 sum 4096 125.98 ns/op 12.72x uint32 min 4096 133.07 ns/op 22.64x uint32 max 4096 132.61 ns/op 22.74x uint32 add 4096 408.41 ns/op 6.22x uint32 sub 4096 412.60 ns/op 6.16x uint32 mul 4096 507.06 ns/op 5.55x uint32 div 4096 6041.48 ns/op 1.00x uint32 sum 16384 649.25 ns/op 10.07x uint32 min 16384 637.87 ns/op 18.85x uint32 max 16384 645.31 ns/op 18.70x uint32 add 16384 1975.68 ns/op 5.06x uint32 sub 16384 1991.51 ns/op 4.94x uint32 mul 16384 2033.82 ns/op 5.41x uint32 div 16384 24277.79 ns/op 0.99x TYPE OP SIZE RATE SPEEDUP uint64 sum 256 18.53 ns/op 5.58x uint64 min 256 95.22 ns/op 1.98x uint64 max 256 98.92 ns/op 1.91x uint64 add 256 36.25 ns/op 4.35x uint64 sub 256 35.94 ns/op 4.44x uint64 mul 256 101.35 ns/op 1.75x uint64 div 256 383.12 ns/op 1.00x uint64 sum 4096 296.89 ns/op 5.39x uint64 min 4096 1593.28 ns/op 
1.90x uint64 max 4096 1572.11 ns/op 1.92x uint64 add 4096 976.07 ns/op 2.55x uint64 sub 4096 984.38 ns/op 2.56x uint64 mul 4096 1709.65 ns/op 1.63x uint64 div 4096 6072.12 ns/op 1.01x uint64 sum 16384 1280.59 ns/op 5.17x uint64 min 16384 6189.62 ns/op 1.96x uint64 max 16384 6194.25 ns/op 1.93x uint64 add 16384 4021.97 ns/op 2.55x uint64 sub 16384 3982.78 ns/op 2.57x uint64 mul 16384 6725.83 ns/op 1.64x uint64 div 16384 24463.65 ns/op 1.02x TYPE OP SIZE RATE SPEEDUP int8 sum 256 6.60 ns/op 16.73x int8 min 256 7.26 ns/op 19.91x int8 max 256 7.26 ns/op 20.19x int8 add 256 9.21 ns/op 18.82x int8 sub 256 9.39 ns/op 18.27x int8 mul 256 17.92 ns/op 9.83x int8 div 256 818.03 ns/op 0.71x int8 sum 4096 38.16 ns/op 42.02x int8 min 4096 38.91 ns/op 57.41x int8 max 4096 38.70 ns/op 56.66x int8 add 4096 75.48 ns/op 36.97x int8 sub 4096 74.46 ns/op 37.00x int8 mul 4096 226.79 ns/op 11.85x int8 div 4096 13120.54 ns/op 0.69x int8 sum 16384 131.28 ns/op 49.58x int8 min 16384 131.36 ns/op 68.74x int8 max 16384 132.08 ns/op 68.57x int8 add 16384 417.09 ns/op 26.35x int8 sub 16384 411.26 ns/op 26.84x int8 mul 16384 900.74 ns/op 12.24x int8 div 16384 52317.05 ns/op 0.69x TYPE OP SIZE RATE SPEEDUP int16 sum 256 8.17 ns/op 13.64x int16 min 256 8.50 ns/op 22.13x int16 max 256 8.49 ns/op 21.84x int16 add 256 12.55 ns/op 14.16x int16 sub 256 12.90 ns/op 13.65x int16 mul 256 12.81 ns/op 15.47x int16 div 256 523.61 ns/op 1.10x int16 sum 4096 66.69 ns/op 23.65x int16 min 4096 66.74 ns/op 45.50x int16 max 4096 66.81 ns/op 44.62x int16 add 4096 130.95 ns/op 21.24x int16 sub 4096 130.76 ns/op 21.18x int16 mul 4096 130.26 ns/op 23.53x int16 div 4096 8162.28 ns/op 1.12x int16 sum 16384 290.53 ns/op 23.18x int16 min 16384 303.05 ns/op 39.85x int16 max 16384 306.08 ns/op 38.30x int16 add 16384 1000.12 ns/op 11.27x int16 sub 16384 996.05 ns/op 11.19x int16 mul 16384 1009.52 ns/op 12.17x int16 div 16384 32518.88 ns/op 1.13x TYPE OP SIZE RATE SPEEDUP int32 sum 256 10.79 ns/op 9.84x int32 min 256 11.42 
ns/op 16.40x int32 max 256 10.96 ns/op 17.12x int32 add 256 20.39 ns/op 7.45x int32 sub 256 19.83 ns/op 7.16x int32 mul 256 30.90 ns/op 5.61x int32 div 256 379.47 ns/op 1.01x int32 sum 4096 130.67 ns/op 12.45x int32 min 4096 134.71 ns/op 22.40x int32 max 4096 125.29 ns/op 24.25x int32 add 4096 412.80 ns/op 6.18x int32 sub 4096 417.97 ns/op 6.11x int32 mul 4096 505.23 ns/op 5.21x int32 div 4096 6085.25 ns/op 1.00x int32 sum 16384 667.40 ns/op 9.69x int32 min 16384 655.49 ns/op 18.18x int32 max 16384 648.01 ns/op 18.86x int32 add 16384 1995.43 ns/op 5.04x int32 sub 16384 1961.25 ns/op 5.03x int32 mul 16384 2040.80 ns/op 5.19x int32 div 16384 24338.73 ns/op 1.00x TYPE OP SIZE RATE SPEEDUP int64 sum 256 9.26 ns/op 11.19x int64 min 256 25.42 ns/op 3.39x int64 max 256 80.10 ns/op 1.58x int64 add 256 36.52 ns/op 4.34x int64 sub 256 36.71 ns/op 4.36x int64 mul 256 106.45 ns/op 1.63x int64 div 256 380.91 ns/op 1.03x int64 sum 4096 295.11 ns/op 5.59x int64 min 4096 1132.89 ns/op 2.68x int64 max 4096 1165.34 ns/op 2.61x int64 add 4096 997.97 ns/op 2.53x int64 sub 4096 976.80 ns/op 2.58x int64 mul 4096 1721.22 ns/op 1.63x int64 div 4096 6124.12 ns/op 1.01x int64 sum 16384 1279.70 ns/op 5.17x int64 min 16384 4355.66 ns/op 2.79x int64 max 16384 4553.27 ns/op 2.61x int64 add 16384 4003.71 ns/op 2.55x int64 sub 16384 4150.76 ns/op 2.45x int64 mul 16384 6037.59 ns/op 2.43x int64 div 16384 24871.54 ns/op 0.99x TYPE OP SIZE RATE SPEEDUP float32 sum 256 12.07 ns/op 12.44x float32 min 256 12.33 ns/op 11.50x float32 max 256 11.36 ns/op 13.25x float32 add 256 19.95 ns/op 8.08x float32 sub 256 19.54 ns/op 7.94x float32 mul 256 19.58 ns/op 8.24x float32 div 256 59.00 ns/op 5.31x float32 sum 4096 132.69 ns/op 22.35x float32 min 4096 131.46 ns/op 17.27x float32 max 4096 131.15 ns/op 16.92x float32 add 4096 370.71 ns/op 6.69x float32 sub 4096 415.25 ns/op 6.06x float32 mul 4096 412.00 ns/op 5.93x float32 div 4096 946.05 ns/op 5.12x float32 sum 16384 623.06 ns/op 19.16x float32 min 16384 
650.93 ns/op 13.67x float32 max 16384 640.29 ns/op 14.44x float32 add 16384 2056.12 ns/op 4.95x float32 sub 16384 2002.50 ns/op 4.99x float32 mul 16384 2048.68 ns/op 4.79x float32 div 16384 4053.14 ns/op 5.01x TYPE OP SIZE RATE SPEEDUP float64 sum 256 19.07 ns/op 8.82x float64 min 256 19.35 ns/op 7.59x float64 max 256 19.11 ns/op 7.89x float64 add 256 37.08 ns/op 4.17x float64 sub 256 32.91 ns/op 5.00x float64 mul 256 36.15 ns/op 4.44x float64 div 256 505.90 ns/op 1.01x float64 sum 4096 268.04 ns/op 11.24x float64 min 4096 284.08 ns/op 7.97x float64 max 4096 301.93 ns/op 7.79x float64 add 4096 1013.73 ns/op 2.53x float64 sub 4096 992.44 ns/op 2.54x float64 mul 4096 967.94 ns/op 2.62x float64 div 4096 7182.29 ns/op 1.10x float64 sum 16384 1242.69 ns/op 9.33x float64 min 16384 1268.02 ns/op 7.10x float64 max 16384 1273.06 ns/op 7.15x float64 add 16384 4086.44 ns/op 2.47x float64 sub 16384 4026.68 ns/op 2.53x float64 mul 16384 4163.52 ns/op 2.41x float64 div 16384 32172.72 ns/op 0.98x PASS ``` Below are the results for the Apple M3 Pro (Apple Silicon) machine. 
```go goos: darwin goarch: arm64 pkg: github.com/kelindar/simd cpu: Apple M3 Pro TYPE OP SIZE RATE SPEEDUP uint8 sum 256 4.91 ns/op 14.77x uint8 min 256 4.85 ns/op 37.35x uint8 max 256 5.08 ns/op 36.97x uint8 add 256 8.29 ns/op 13.05x uint8 sub 256 8.31 ns/op 12.96x uint8 mul 256 8.71 ns/op 12.48x uint8 div 256 130.96 ns/op 1.12x uint8 sum 4096 48.78 ns/op 21.38x uint8 min 4096 51.58 ns/op 61.09x uint8 max 4096 48.59 ns/op 65.16x uint8 add 4096 59.04 ns/op 27.71x uint8 sub 4096 59.42 ns/op 27.47x uint8 mul 4096 59.57 ns/op 27.45x uint8 div 4096 2074.64 ns/op 1.13x uint8 sum 16384 235.86 ns/op 17.73x uint8 min 16384 234.78 ns/op 53.59x uint8 max 16384 238.65 ns/op 53.78x uint8 add 16384 277.92 ns/op 23.88x uint8 sub 16384 275.56 ns/op 24.04x uint8 mul 16384 280.81 ns/op 23.59x uint8 div 16384 8163.29 ns/op 1.15x TYPE OP SIZE RATE SPEEDUP uint16 sum 256 6.80 ns/op 10.69x uint16 min 256 6.91 ns/op 26.50x uint16 max 256 6.94 ns/op 26.21x uint16 add 256 11.80 ns/op 9.16x uint16 sub 256 11.87 ns/op 9.20x uint16 mul 256 11.80 ns/op 9.26x uint16 div 256 129.94 ns/op 1.12x uint16 sum 4096 108.85 ns/op 10.18x uint16 min 4096 105.79 ns/op 29.66x uint16 max 4096 106.19 ns/op 29.43x uint16 add 4096 112.26 ns/op 14.59x uint16 sub 4096 118.79 ns/op 13.80x uint16 mul 4096 116.68 ns/op 14.21x uint16 div 4096 2056.41 ns/op 1.13x uint16 sum 16384 529.53 ns/op 7.93x uint16 min 16384 497.89 ns/op 25.35x uint16 max 16384 512.68 ns/op 24.40x uint16 add 16384 548.44 ns/op 11.99x uint16 sub 16384 579.93 ns/op 11.40x uint16 mul 16384 526.33 ns/op 12.69x uint16 div 16384 8454.70 ns/op 1.11x TYPE OP SIZE RATE SPEEDUP uint32 sum 256 11.29 ns/op 6.73x uint32 min 256 11.25 ns/op 11.40x uint32 max 256 11.30 ns/op 11.11x uint32 add 256 17.86 ns/op 6.09x uint32 sub 256 18.64 ns/op 5.79x uint32 mul 256 17.84 ns/op 6.10x uint32 div 256 132.50 ns/op 1.11x uint32 sum 4096 240.50 ns/op 4.46x uint32 min 4096 246.75 ns/op 8.63x uint32 max 4096 242.99 ns/op 8.72x uint32 add 4096 254.21 ns/op 6.49x uint32 
sub 4096 258.73 ns/op 6.33x uint32 mul 4096 280.35 ns/op 5.87x uint32 div 4096 2187.58 ns/op 1.12x uint32 sum 16384 1039.29 ns/op 4.20x uint32 min 16384 1067.80 ns/op 7.97x uint32 max 16384 1023.83 ns/op 8.29x uint32 add 16384 887.07 ns/op 7.41x uint32 sub 16384 889.97 ns/op 7.66x uint32 mul 16384 886.21 ns/op 7.45x uint32 div 16384 9012.67 ns/op 1.04x TYPE OP SIZE RATE SPEEDUP uint64 sum 256 21.81 ns/op 3.38x uint64 min 256 42.46 ns/op 2.95x uint64 max 256 41.39 ns/op 3.08x uint64 add 256 30.89 ns/op 3.52x uint64 sub 256 30.91 ns/op 3.49x uint64 mul 256 74.32 ns/op 1.45x uint64 div 256 134.35 ns/op 1.10x uint64 sum 4096 491.83 ns/op 2.12x uint64 min 4096 981.65 ns/op 2.17x uint64 max 4096 992.13 ns/op 2.11x uint64 add 4096 549.37 ns/op 2.97x uint64 sub 4096 484.83 ns/op 3.50x uint64 mul 4096 1091.51 ns/op 1.50x uint64 div 4096 2136.43 ns/op 1.09x uint64 sum 16384 2091.84 ns/op 2.13x uint64 min 16384 4061.30 ns/op 2.07x uint64 max 16384 4356.20 ns/op 1.97x uint64 add 16384 3391.09 ns/op 1.95x uint64 sub 16384 3518.09 ns/op 1.88x uint64 mul 16384 4433.94 ns/op 1.48x uint64 div 16384 8670.50 ns/op 1.09x TYPE OP SIZE RATE SPEEDUP int8 sum 256 4.80 ns/op 15.42x int8 min 256 4.86 ns/op 38.25x int8 max 256 4.86 ns/op 37.66x int8 add 256 8.38 ns/op 13.32x int8 sub 256 8.24 ns/op 13.54x int8 mul 256 8.71 ns/op 12.38x int8 div 256 129.52 ns/op 1.12x int8 sum 4096 49.14 ns/op 21.24x int8 min 4096 50.77 ns/op 60.68x int8 max 4096 48.70 ns/op 63.33x int8 add 4096 62.65 ns/op 26.14x int8 sub 4096 62.24 ns/op 26.46x int8 mul 4096 59.96 ns/op 27.77x int8 div 4096 2073.18 ns/op 1.17x int8 sum 16384 247.78 ns/op 16.55x int8 min 16384 257.64 ns/op 52.10x int8 max 16384 236.66 ns/op 53.97x int8 add 16384 262.95 ns/op 26.29x int8 sub 16384 254.03 ns/op 27.76x int8 mul 16384 272.69 ns/op 26.59x int8 div 16384 8479.32 ns/op 1.15x TYPE OP SIZE RATE SPEEDUP int16 sum 256 7.05 ns/op 10.97x int16 min 256 7.19 ns/op 26.90x int16 max 256 6.90 ns/op 26.61x int16 add 256 13.51 ns/op 8.06x int16 
sub 256 12.27 ns/op 9.59x int16 mul 256 12.21 ns/op 8.90x int16 div 256 130.96 ns/op 1.13x int16 sum 4096 112.66 ns/op 9.51x int16 min 4096 108.11 ns/op 28.60x int16 max 4096 108.40 ns/op 29.06x int16 add 4096 125.41 ns/op 13.54x int16 sub 4096 119.49 ns/op 13.86x int16 mul 4096 123.22 ns/op 13.78x int16 div 4096 2074.25 ns/op 1.11x int16 sum 16384 494.65 ns/op 8.57x int16 min 16384 489.44 ns/op 25.05x int16 max 16384 493.22 ns/op 25.27x int16 add 16384 522.42 ns/op 12.49x int16 sub 16384 535.28 ns/op 12.33x int16 mul 16384 559.30 ns/op 11.72x int16 div 16384 8296.67 ns/op 1.12x TYPE OP SIZE RATE SPEEDUP int32 sum 256 11.30 ns/op 6.41x int32 min 256 11.29 ns/op 11.20x int32 max 256 11.28 ns/op 11.24x int32 add 256 17.78 ns/op 6.06x int32 sub 256 17.78 ns/op 6.09x int32 mul 256 17.78 ns/op 6.07x int32 div 256 129.66 ns/op 1.13x int32 sum 4096 236.73 ns/op 4.40x int32 min 4096 237.77 ns/op 8.79x int32 max 4096 235.05 ns/op 8.97x int32 add 4096 225.48 ns/op 7.24x int32 sub 4096 240.99 ns/op 6.79x int32 mul 4096 258.20 ns/op 6.33x int32 div 4096 2075.19 ns/op 1.11x int32 sum 16384 1011.10 ns/op 4.18x int32 min 16384 1011.42 ns/op 8.41x int32 max 16384 1002.50 ns/op 8.39x int32 add 16384 881.46 ns/op 7.42x int32 sub 16384 884.55 ns/op 7.38x int32 mul 16384 887.31 ns/op 7.40x int32 div 16384 8352.29 ns/op 1.12x TYPE OP SIZE RATE SPEEDUP int64 sum 256 35.13 ns/op 2.07x int64 min 256 41.41 ns/op 3.08x int64 max 256 41.26 ns/op 3.02x int64 add 256 30.90 ns/op 3.49x int64 sub 256 30.88 ns/op 3.49x int64 mul 256 71.46 ns/op 1.51x int64 div 256 134.15 ns/op 1.09x int64 sum 4096 527.85 ns/op 1.98x int64 min 4096 981.92 ns/op 2.15x int64 max 4096 985.04 ns/op 2.15x int64 add 4096 486.18 ns/op 3.36x int64 sub 4096 476.42 ns/op 3.43x int64 mul 4096 1094.60 ns/op 1.50x int64 div 4096 2141.80 ns/op 1.09x int64 sum 16384 2094.27 ns/op 2.13x int64 min 16384 4036.02 ns/op 2.07x int64 max 16384 4101.59 ns/op 2.07x int64 add 16384 3500.60 ns/op 1.92x int64 sub 16384 3485.66 ns/op 1.88x 
int64 mul 16384 4372.74 ns/op 1.50x int64 div 16384 9099.17 ns/op 1.05x TYPE OP SIZE RATE SPEEDUP float32 sum 256 11.76 ns/op 10.34x float32 min 256 11.25 ns/op 19.61x float32 max 256 11.25 ns/op 15.58x float32 add 256 18.06 ns/op 6.11x float32 sub 256 17.85 ns/op 6.11x float32 mul 256 17.81 ns/op 6.05x float32 div 256 21.08 ns/op 5.11x float32 sum 4096 320.75 ns/op 8.54x float32 min 4096 232.22 ns/op 17.46x float32 max 4096 231.89 ns/op 17.14x float32 add 4096 277.66 ns/op 5.87x float32 sub 4096 248.42 ns/op 6.56x float32 mul 4096 240.00 ns/op 6.79x float32 div 4096 288.28 ns/op 5.65x float32 sum 16384 1384.83 ns/op 7.98x float32 min 16384 1009.17 ns/op 16.04x float32 max 16384 1006.63 ns/op 16.19x float32 add 16384 884.13 ns/op 7.39x float32 sub 16384 882.45 ns/op 7.42x float32 mul 16384 882.46 ns/op 7.43x float32 div 16384 1100.18 ns/op 5.95x TYPE OP SIZE RATE SPEEDUP float64 sum 256 27.91 ns/op 4.33x float64 min 256 21.68 ns/op 10.27x float64 max 256 21.79 ns/op 8.05x float64 add 256 30.51 ns/op 3.53x float64 sub 256 30.42 ns/op 3.56x float64 mul 256 30.48 ns/op 3.52x float64 div 256 37.69 ns/op 2.86x float64 sum 4096 669.96 ns/op 4.08x float64 min 4096 489.15 ns/op 8.23x float64 max 4096 499.26 ns/op 7.96x float64 add 4096 485.25 ns/op 3.37x float64 sub 4096 485.85 ns/op 3.37x float64 mul 4096 476.16 ns/op 3.42x float64 div 4096 574.07 ns/op 2.84x float64 sum 16384 2805.05 ns/op 3.90x float64 min 16384 2052.30 ns/op 7.90x float64 max 16384 2070.18 ns/op 7.79x float64 add 16384 3488.30 ns/op 1.87x float64 sub 16384 3492.81 ns/op 1.87x float64 mul 16384 3501.81 ns/op 1.86x float64 div 16384 3490.82 ns/op 1.87x ``` # Acknowledgements This library was originally inspired by the work of Valery Carey & Adrian Witas in [viant/vec](https://github.com/viant/vec) package, but instead of hand-rolled assembly and intrinsics I opted for using auto-vectorization for maintainability reasons. 
golang-github-kelindar-simd-1.2.0/bench_test.go000066400000000000000000000127541517522302000214350ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for details. package simd import ( "fmt" "testing" "github.com/stretchr/testify/assert" ) type ops[T Number] struct { sum func([]T) T min func([]T) T max func([]T) T add func([]T, []T, []T) []T sub func([]T, []T, []T) []T mul func([]T, []T, []T) []T div func([]T, []T, []T) []T } type suite struct { name string benchmark func(*testing.B) testOps func(*testing.T) testFallback func(*testing.T) } func makeSuite[T Number](name string, fn ops[T]) suite { return suite{ name: name, benchmark: func(b *testing.B) { benchmarkOps(b, name, fn) }, testOps: func(t *testing.T) { testOps(t, fn) }, testFallback: func(t *testing.T) { withHardwareDisabled(func() { testOps(t, fn) }) }, } } func suites() []suite { return []suite{ makeSuite("uint8", ops[uint8]{sum: SumUint8s, min: MinUint8s, max: MaxUint8s, add: AddUint8s, sub: SubUint8s, mul: MulUint8s, div: DivUint8s}), makeSuite("uint16", ops[uint16]{sum: SumUint16s, min: MinUint16s, max: MaxUint16s, add: AddUint16s, sub: SubUint16s, mul: MulUint16s, div: DivUint16s}), makeSuite("uint32", ops[uint32]{sum: SumUint32s, min: MinUint32s, max: MaxUint32s, add: AddUint32s, sub: SubUint32s, mul: MulUint32s, div: DivUint32s}), makeSuite("uint64", ops[uint64]{sum: SumUint64s, min: MinUint64s, max: MaxUint64s, add: AddUint64s, sub: SubUint64s, mul: MulUint64s, div: DivUint64s}), makeSuite("int8", ops[int8]{sum: SumInt8s, min: MinInt8s, max: MaxInt8s, add: AddInt8s, sub: SubInt8s, mul: MulInt8s, div: DivInt8s}), makeSuite("int16", ops[int16]{sum: SumInt16s, min: MinInt16s, max: MaxInt16s, add: AddInt16s, sub: SubInt16s, mul: MulInt16s, div: DivInt16s}), makeSuite("int32", ops[int32]{sum: SumInt32s, min: MinInt32s, max: MaxInt32s, add: AddInt32s, sub: SubInt32s, mul: MulInt32s, 
div: DivInt32s}), makeSuite("int64", ops[int64]{sum: SumInt64s, min: MinInt64s, max: MaxInt64s, add: AddInt64s, sub: SubInt64s, mul: MulInt64s, div: DivInt64s}), makeSuite("float32", ops[float32]{sum: SumFloat32s, min: MinFloat32s, max: MaxFloat32s, add: AddFloat32s, sub: SubFloat32s, mul: MulFloat32s, div: DivFloat32s}), makeSuite("float64", ops[float64]{sum: SumFloat64s, min: MinFloat64s, max: MaxFloat64s, add: AddFloat64s, sub: SubFloat64s, mul: MulFloat64s, div: DivFloat64s}), } } func BenchmarkSIMD(b *testing.B) { for _, tc := range suites() { b.Run(tc.name, tc.benchmark) } } func TestSIMDOps(t *testing.T) { for _, tc := range suites() { t.Run(tc.name, tc.testOps) } } func TestSIMDFallback(t *testing.T) { for _, tc := range suites() { t.Run(tc.name, tc.testFallback) } } func benchmarkOps[T Number](b *testing.B, typ string, fn ops[T]) { result := make([]Result, 0, 64) for _, count := range []int{256, 4096, 16384} { vector := makeVector[T](count) result = append(result, runBenchmark(b, typ, "sum", count, func(b *testing.B) { var out T for i := 0; i < b.N; i++ { out = fn.sum(vector) } assert.NotZero(b, out) })) result = append(result, runBenchmark(b, typ, "min", count, func(b *testing.B) { var out T for i := 0; i < b.N; i++ { out = fn.min(vector) } assert.NotZero(b, out) })) result = append(result, runBenchmark(b, typ, "max", count, func(b *testing.B) { var out T for i := 0; i < b.N; i++ { out = fn.max(vector) } assert.NotZero(b, out) })) output := make([]T, count) input1 := makeVector[T](count) input2 := makeVector[T](count) result = append(result, runBenchmark(b, typ, "add", count, func(b *testing.B) { var out []T for i := 0; i < b.N; i++ { out = fn.add(output, input1, input2) } assert.NotEmpty(b, out) })) result = append(result, runBenchmark(b, typ, "sub", count, func(b *testing.B) { var out []T for i := 0; i < b.N; i++ { out = fn.sub(output, input1, input2) } assert.NotEmpty(b, out) })) result = append(result, runBenchmark(b, typ, "mul", count, func(b 
*testing.B) { var out []T for i := 0; i < b.N; i++ { out = fn.mul(output, input1, input2) } assert.NotEmpty(b, out) })) result = append(result, runBenchmark(b, typ, "div", count, func(b *testing.B) { var out []T for i := 0; i < b.N; i++ { out = fn.div(output, input1, input2) } assert.NotEmpty(b, out) })) } fmt.Println() fmt.Println(" TYPE OP SIZE RATE SPEEDUP") for _, r := range result { fmt.Printf("%7s %5s %7d %8.2f ns/op %7.2fx\n", r.Type, r.Name, r.Size, r.Rate, r.Speedup) } } func testOps[T Number](t *testing.T, fn ops[T]) { input := makeVector[T](70) assert.EqualValues(t, sum(input), fn.sum(input)) assert.EqualValues(t, min(input), fn.min(input)) assert.EqualValues(t, max(input), fn.max(input)) input1 := makeVector[T](70) input2 := makeVector[T](70) assert.EqualValues(t, add(make([]T, 70), input1, input2), fn.add(make([]T, 70), input1, input2)) assert.EqualValues(t, sub(make([]T, 70), input1, input2), fn.sub(make([]T, 70), input1, input2)) assert.EqualValues(t, mul(make([]T, 70), input1, input2), fn.mul(make([]T, 70), input1, input2)) assert.InDeltaSlice(t, div(make([]T, 70), input1, input2), fn.div(make([]T, 70), input1, input2), 0.01) } func withHardwareDisabled(fn func()) { prev := hardware hardware = false defer func() { hardware = prev }() fn() } golang-github-kelindar-simd-1.2.0/codegen/000077500000000000000000000000001517522302000203635ustar00rootroot00000000000000golang-github-kelindar-simd-1.2.0/codegen/README.md000066400000000000000000000003731517522302000216450ustar00rootroot00000000000000## Auto-Generate Vectorized Code This folder contains all of the tools that are used to generate [auto-vectorized](https://llvm.org/docs/Vectorizers.html) assembly and translate it to Go PLAN9 assembly using [gocc](https://github.com/kelindar/gocc). 
golang-github-kelindar-simd-1.2.0/codegen/generate.sh000077500000000000000000000005451517522302000225200ustar00rootroot00000000000000#!/bin/bash set -euo pipefail cd "$(dirname "$0")" go run ./templates/main.go # requires gocc: go install github.com/kelindar/gocc/cmd/gocc@latest #gocc simd_avx2_amd64.c --arch avx2 -O3 --package simd --output ../ #gocc simd_neon_arm64.c --arch neon -O3 --package simd --output ../ #gocc simd_apple_arm64.c --arch apple -O3 --package simd --output ../ golang-github-kelindar-simd-1.2.0/codegen/simd_apple_arm64.c000077500000000000000000000445711517522302000236730ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for details.

// NOTE(review): the include target is missing in this view; presumably
// <stdint.h>, since the fixed-width integer types are used below. Confirm.
#include

// Conventions shared by every kernel in this file:
//   - Reductions (_T_sum, _T_min, _T_max) read `size` elements from `input`
//     and store a single value through `result`.
//   - Element-wise kernels (_T_add, _T_sub, _T_mul, _T_div) read `size`
//     elements from `input1` and `input2` and write `size` elements to
//     `output`.
//   - Each loop carries a clang loop pragma that requests vectorization and
//     interleaving.
//   - NOTE(review): the loop bound is `(int)size`, so the 64-bit count is
//     narrowed to int before the comparison; counts that do not fit in an
//     int are not handled.
//   - NOTE(review): _T_min and _T_max read input[0] before the loop, even
//     when size is 0. Presumably callers never pass an empty buffer; verify.
//   - NOTE(review): _T_div performs no check for a zero divisor.
//   - NOTE(review): the sum accumulators are initialised with the
//     floating-point literal 0.0, which converts to integer zero.

// ---------------------------------- Uint8 ----------------------------------

// Adds up `size` uint8 values; the running total is kept in a uint8_t.
void _uint8_sum(uint8_t *input, uint8_t *result, uint64_t size) {
    uint8_t sum = 0.0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

// Stores the smallest of `size` uint8 values, seeded with input[0].
void _uint8_min(uint8_t *input, uint8_t *result, uint64_t size) {
    uint8_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

// Stores the largest of `size` uint8 values, seeded with input[0].
void _uint8_max(uint8_t *input, uint8_t *result, uint64_t size) {
    uint8_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

// output[i] = input1[i] + input2[i], stored as uint8.
void _uint8_add(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

// output[i] = input1[i] - input2[i], stored as uint8.
void _uint8_sub(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

// output[i] = input1[i] * input2[i], stored as uint8.
void _uint8_mul(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

// output[i] = input1[i] / input2[i] (integer division, divisor unchecked).
void _uint8_div(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Uint16 ----------------------------------

// Adds up `size` uint16 values; the running total is kept in a uint16_t.
void _uint16_sum(uint16_t *input, uint16_t *result, uint64_t size) {
    uint16_t sum = 0.0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

// Stores the smallest of `size` uint16 values, seeded with input[0].
void _uint16_min(uint16_t *input, uint16_t *result, uint64_t size) {
    uint16_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

// Stores the largest of `size` uint16 values, seeded with input[0].
void _uint16_max(uint16_t *input, uint16_t *result, uint64_t size) {
    uint16_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

// output[i] = input1[i] + input2[i], stored as uint16.
void _uint16_add(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

// output[i] = input1[i] - input2[i], stored as uint16.
void _uint16_sub(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

// output[i] = input1[i] * input2[i], stored as uint16.
// NOTE(review): where int is 32 bits, both operands are promoted to signed
// int before the multiply, so large products (e.g. 0xFFFF * 0xFFFF) exceed
// INT_MAX. Confirm this is acceptable for the targets built from this file.
void _uint16_mul(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

// output[i] = input1[i] / input2[i] (integer division, divisor unchecked).
void _uint16_div(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Uint32 ----------------------------------

// Adds up `size` uint32 values; the running total is kept in a uint32_t.
void _uint32_sum(uint32_t *input, uint32_t *result, uint64_t size) {
    uint32_t sum = 0.0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

// Stores the smallest of `size` uint32 values, seeded with input[0].
void _uint32_min(uint32_t *input, uint32_t *result, uint64_t size) {
    uint32_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

// Stores the largest of `size` uint32 values, seeded with input[0].
void _uint32_max(uint32_t *input, uint32_t *result, uint64_t size) {
    uint32_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

// output[i] = input1[i] + input2[i], stored as uint32.
void _uint32_add(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

// output[i] = input1[i] - input2[i], stored as uint32.
void _uint32_sub(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

// output[i] = input1[i] * input2[i], stored as uint32.
void _uint32_mul(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

// output[i] = input1[i] / input2[i] (integer division, divisor unchecked).
void _uint32_div(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Uint64 ----------------------------------

// Adds up `size` uint64 values; the running total is kept in a uint64_t.
void _uint64_sum(uint64_t *input, uint64_t *result, uint64_t size) {
    uint64_t sum = 0.0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

// Stores the smallest of `size` uint64 values, seeded with input[0].
void _uint64_min(uint64_t *input, uint64_t *result, uint64_t size) {
    uint64_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

// Stores the largest of `size` uint64 values, seeded with input[0].
void _uint64_max(uint64_t *input, uint64_t *result, uint64_t size) {
    uint64_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

// output[i] = input1[i] + input2[i], stored as uint64.
void _uint64_add(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

// output[i] = input1[i] - input2[i], stored as uint64.
void _uint64_sub(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

// output[i] = input1[i] * input2[i], stored as uint64.
void _uint64_mul(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

// output[i] = input1[i] / input2[i] (integer division, divisor unchecked).
void _uint64_div(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Int8 ----------------------------------

// Adds up `size` int8 values; the running total is kept in an int8_t.
void _int8_sum(int8_t *input, int8_t *result, uint64_t size) {
    int8_t sum = 0.0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _int8_min(int8_t *input, int8_t *result, uint64_t size) {
    int8_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (int i = 0; i < (int)size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
*result = min; } void _int8_max(int8_t *input, int8_t *result, uint64_t size) { int8_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int8_add(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int8_sub(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int8_mul(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int8_div(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int16 ---------------------------------- void _int16_sum(int16_t *input, int16_t *result, uint64_t size) { int16_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int16_min(int16_t *input, int16_t *result, uint64_t size) { int16_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int16_max(int16_t *input, int16_t *result, uint64_t size) { int16_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int16_add(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop 
vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int16_sub(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int16_mul(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int16_div(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int32 ---------------------------------- void _int32_sum(int32_t *input, int32_t *result, uint64_t size) { int32_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int32_min(int32_t *input, int32_t *result, uint64_t size) { int32_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int32_max(int32_t *input, int32_t *result, uint64_t size) { int32_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int32_add(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int32_sub(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void 
_int32_mul(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int32_div(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int64 ---------------------------------- void _int64_sum(int64_t *input, int64_t *result, uint64_t size) { int64_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int64_min(int64_t *input, int64_t *result, uint64_t size) { int64_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int64_max(int64_t *input, int64_t *result, uint64_t size) { int64_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int64_add(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int64_sub(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int64_mul(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int64_div(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) 
for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Float32 ---------------------------------- void _float32_sum(float *input, float *result, uint64_t size) { float sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _float32_min(float *input, float *result, uint64_t size) { float min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _float32_max(float *input, float *result, uint64_t size) { float max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _float32_add(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _float32_sub(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _float32_mul(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _float32_div(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Float64 ---------------------------------- void _float64_sum(double *input, double *result, uint64_t size) { double sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void 
_float64_min(double *input, double *result, uint64_t size) { double min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _float64_max(double *input, double *result, uint64_t size) { double max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _float64_add(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _float64_sub(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _float64_mul(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _float64_div(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } golang-github-kelindar-simd-1.2.0/codegen/simd_avx2_amd64.c000077500000000000000000000445711517522302000234340ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for details. 
#include <stdint.h>

// Reduction and element-wise kernels over contiguous arrays, one family per
// numeric type T:
//
//   _T_sum(input, result, size)        *result = input[0] + ... + input[size-1]
//   _T_min(input, result, size)        *result = smallest element of input
//   _T_max(input, result, size)        *result = largest element of input
//   _T_add/_sub/_mul/_div(input1, input2, output, size)
//                                      output[i] = input1[i] OP input2[i]
//
// Contract:
//   - `size` is the element count. Loop counters are uint64_t so the whole
//     range of `size` is honoured (an `int` counter truncates above INT_MAX).
//   - sum of an empty input stores 0. min/max of an empty input return
//     without reading `input` or writing `*result`.
//   - No overflow or divide-by-zero checks are performed; results are stored
//     back in the element type.

// ---------------------------------- Uint8 ----------------------------------

void _uint8_sum(uint8_t *input, uint8_t *result, uint64_t size) {
    uint8_t sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _uint8_min(uint8_t *input, uint8_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint8_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

void _uint8_max(uint8_t *input, uint8_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint8_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

void _uint8_add(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

void _uint8_sub(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

void _uint8_mul(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

void _uint8_div(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Uint16 ----------------------------------

void _uint16_sum(uint16_t *input, uint16_t *result, uint64_t size) {
    uint16_t sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _uint16_min(uint16_t *input, uint16_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint16_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

void _uint16_max(uint16_t *input, uint16_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint16_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

void _uint16_add(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

void _uint16_sub(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

void _uint16_mul(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

void _uint16_div(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Uint32 ----------------------------------

void _uint32_sum(uint32_t *input, uint32_t *result, uint64_t size) {
    uint32_t sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _uint32_min(uint32_t *input, uint32_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint32_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

void _uint32_max(uint32_t *input, uint32_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint32_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

void _uint32_add(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

void _uint32_sub(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

void _uint32_mul(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

void _uint32_div(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Uint64 ----------------------------------

void _uint64_sum(uint64_t *input, uint64_t *result, uint64_t size) {
    uint64_t sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _uint64_min(uint64_t *input, uint64_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint64_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

void _uint64_max(uint64_t *input, uint64_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    uint64_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

void _uint64_add(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

void _uint64_sub(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

void _uint64_mul(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

void _uint64_div(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Int8 ----------------------------------

void _int8_sum(int8_t *input, int8_t *result, uint64_t size) {
    int8_t sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _int8_min(int8_t *input, int8_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    int8_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

void _int8_max(int8_t *input, int8_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    int8_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

void _int8_add(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

void _int8_sub(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

void _int8_mul(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

void _int8_div(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

// ---------------------------------- Int16 ----------------------------------

void _int16_sum(int16_t *input, int16_t *result, uint64_t size) {
    int16_t sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _int16_min(int16_t *input, int16_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    int16_t min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

void _int16_max(int16_t *input, int16_t *result, uint64_t size) {
    if (size == 0) return; // empty input: *result is left untouched
    int16_t max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

void _int16_add(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

void _int16_sub(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

void _int16_mul(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

void _int16_div(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] / input2[i];
    }
}

//
---------------------------------- Int32 ---------------------------------- void _int32_sum(int32_t *input, int32_t *result, uint64_t size) { int32_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int32_min(int32_t *input, int32_t *result, uint64_t size) { int32_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int32_max(int32_t *input, int32_t *result, uint64_t size) { int32_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int32_add(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int32_sub(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int32_mul(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int32_div(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int64 ---------------------------------- void _int64_sum(int64_t *input, int64_t *result, uint64_t size) { int64_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int64_min(int64_t *input, int64_t *result, uint64_t 
size) { int64_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int64_max(int64_t *input, int64_t *result, uint64_t size) { int64_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int64_add(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int64_sub(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int64_mul(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int64_div(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Float32 ---------------------------------- void _float32_sum(float *input, float *result, uint64_t size) { float sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _float32_min(float *input, float *result, uint64_t size) { float min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _float32_max(float *input, float *result, uint64_t size) { float max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; 
i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _float32_add(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _float32_sub(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _float32_mul(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _float32_div(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Float64 ---------------------------------- void _float64_sum(double *input, double *result, uint64_t size) { double sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _float64_min(double *input, double *result, uint64_t size) { double min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _float64_max(double *input, double *result, uint64_t size) { double max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _float64_add(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _float64_sub(double *input1, double *input2, double *output, uint64_t size) { 
#pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _float64_mul(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _float64_div(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } golang-github-kelindar-simd-1.2.0/codegen/simd_neon_arm64.c000077500000000000000000000445711517522302000235310ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for details. #include // ---------------------------------- Uint8 ---------------------------------- void _uint8_sum(uint8_t *input, uint8_t *result, uint64_t size) { uint8_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _uint8_min(uint8_t *input, uint8_t *result, uint64_t size) { uint8_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _uint8_max(uint8_t *input, uint8_t *result, uint64_t size) { uint8_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _uint8_add(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _uint8_sub(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) 
interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _uint8_mul(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _uint8_div(uint8_t *input1, uint8_t *input2, uint8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Uint16 ---------------------------------- void _uint16_sum(uint16_t *input, uint16_t *result, uint64_t size) { uint16_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _uint16_min(uint16_t *input, uint16_t *result, uint64_t size) { uint16_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _uint16_max(uint16_t *input, uint16_t *result, uint64_t size) { uint16_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _uint16_add(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _uint16_sub(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _uint16_mul(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void 
_uint16_div(uint16_t *input1, uint16_t *input2, uint16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Uint32 ---------------------------------- void _uint32_sum(uint32_t *input, uint32_t *result, uint64_t size) { uint32_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _uint32_min(uint32_t *input, uint32_t *result, uint64_t size) { uint32_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _uint32_max(uint32_t *input, uint32_t *result, uint64_t size) { uint32_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _uint32_add(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _uint32_sub(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _uint32_mul(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _uint32_div(uint32_t *input1, uint32_t *input2, uint32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Uint64 ---------------------------------- void _uint64_sum(uint64_t 
*input, uint64_t *result, uint64_t size) { uint64_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _uint64_min(uint64_t *input, uint64_t *result, uint64_t size) { uint64_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _uint64_max(uint64_t *input, uint64_t *result, uint64_t size) { uint64_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _uint64_add(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _uint64_sub(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _uint64_mul(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _uint64_div(uint64_t *input1, uint64_t *input2, uint64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int8 ---------------------------------- void _int8_sum(int8_t *input, int8_t *result, uint64_t size) { int8_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int8_min(int8_t *input, int8_t *result, uint64_t size) { int8_t min = input[0]; #pragma clang loop vectorize(enable) 
interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int8_max(int8_t *input, int8_t *result, uint64_t size) { int8_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int8_add(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int8_sub(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int8_mul(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int8_div(int8_t *input1, int8_t *input2, int8_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int16 ---------------------------------- void _int16_sum(int16_t *input, int16_t *result, uint64_t size) { int16_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int16_min(int16_t *input, int16_t *result, uint64_t size) { int16_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int16_max(int16_t *input, int16_t *result, uint64_t size) { int16_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void 
_int16_add(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int16_sub(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int16_mul(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int16_div(int16_t *input1, int16_t *input2, int16_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int32 ---------------------------------- void _int32_sum(int32_t *input, int32_t *result, uint64_t size) { int32_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int32_min(int32_t *input, int32_t *result, uint64_t size) { int32_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int32_max(int32_t *input, int32_t *result, uint64_t size) { int32_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int32_add(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int32_sub(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) 
for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int32_mul(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int32_div(int32_t *input1, int32_t *input2, int32_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Int64 ---------------------------------- void _int64_sum(int64_t *input, int64_t *result, uint64_t size) { int64_t sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _int64_min(int64_t *input, int64_t *result, uint64_t size) { int64_t min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _int64_max(int64_t *input, int64_t *result, uint64_t size) { int64_t max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _int64_add(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _int64_sub(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _int64_mul(int64_t *input1, int64_t *input2, int64_t *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _int64_div(int64_t *input1, int64_t *input2, int64_t 
*output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Float32 ---------------------------------- void _float32_sum(float *input, float *result, uint64_t size) { float sum = 0.0; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _float32_min(float *input, float *result, uint64_t size) { float min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _float32_max(float *input, float *result, uint64_t size) { float max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _float32_add(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _float32_sub(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _float32_mul(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _float32_div(float *input1, float *input2, float *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } // ---------------------------------- Float64 ---------------------------------- void _float64_sum(double *input, double *result, uint64_t size) { double sum = 0.0; #pragma clang loop vectorize(enable) 
interleave(enable) for (int i = 0; i < (int)size; ++i) { sum += input[i]; } *result = sum; } void _float64_min(double *input, double *result, uint64_t size) { double min = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] < min) { min = input[i]; } } *result = min; } void _float64_max(double *input, double *result, uint64_t size) { double max = input[0]; #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { if (input[i] > max) { max = input[i]; } } *result = max; } void _float64_add(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] + input2[i]; } } void _float64_sub(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] - input2[i]; } } void _float64_mul(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] * input2[i]; } } void _float64_div(double *input1, double *input2, double *output, uint64_t size) { #pragma clang loop vectorize(enable) interleave(enable) for (int i = 0; i < (int)size; i++) { output[i] = input1[i] / input2[i]; } } golang-github-kelindar-simd-1.2.0/codegen/templates/000077500000000000000000000000001517522302000223615ustar00rootroot00000000000000golang-github-kelindar-simd-1.2.0/codegen/templates/funcs.go.tt000066400000000000000000000050731517522302000244610ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for details. 
package simd

import "unsafe"

{{ range .Types }}
// ---------------------------------- {{.Name}} ----------------------------------

// Sum{{.Name}}s sums up all of the elements of the slice and returns the value.
// An empty slice sums to zero on every code path.
func Sum{{.Name}}s(input []{{.Type}}) (out {{.Type}}) {
	switch {
	case len(input) == 0:
		// Nothing to add; returning here also keeps &input[0] below from
		// panicking, matching what the portable fallback returns.
		return 0
	case hardware:
		_{{.Type}}_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input)))
		return
	default:
		return sum(input)
	}
}

// Min{{.Name}}s returns the smallest element value in the slice.
// The slice must not be empty: both code paths index input[0].
func Min{{.Name}}s(input []{{.Type}}) (out {{.Type}}) {
	switch {
	case hardware:
		_{{.Type}}_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input)))
		return
	default:
		return min(input)
	}
}

// Max{{.Name}}s returns the largest element value in the slice.
// The slice must not be empty: both code paths index input[0].
func Max{{.Name}}s(input []{{.Type}}) (out {{.Type}}) {
	switch {
	case hardware:
		_{{.Type}}_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input)))
		return
	default:
		return max(input)
	}
}

// Add{{.Name}}s adds input1 to input2 and writes back the result into dst slice.
// len(dst) elements are processed; an empty dst is returned untouched.
func Add{{.Name}}s(dst, input1, input2 []{{.Type}}) []{{.Type}} {
	n := len(dst)
	if n == 0 {
		return dst
	}

	switch {
	case hardware:
		// The kernel receives raw pointers and trusts n, so a short input
		// must be rejected here rather than read past its end.
		if len(input1) < n || len(input2) < n {
			panic("simd: input slices are shorter than dst")
		}
		_{{.Type}}_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(n))
		return dst
	}
	return add(dst, input1, input2)
}

// Sub{{.Name}}s subtracts input2 from input1 and writes back the result into dst slice.
// len(dst) elements are processed; an empty dst is returned untouched.
func Sub{{.Name}}s(dst, input1, input2 []{{.Type}}) []{{.Type}} {
	n := len(dst)
	if n == 0 {
		return dst
	}

	switch {
	case hardware:
		// See Add{{.Name}}s: the kernel does no bounds checking of its own.
		if len(input1) < n || len(input2) < n {
			panic("simd: input slices are shorter than dst")
		}
		_{{.Type}}_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(n))
		return dst
	}
	return sub(dst, input1, input2)
}

// Mul{{.Name}}s multiplies input1 by input2 and writes back the result into dst slice.
// len(dst) elements are processed; an empty dst is returned untouched.
func Mul{{.Name}}s(dst, input1, input2 []{{.Type}}) []{{.Type}} {
	n := len(dst)
	if n == 0 {
		return dst
	}

	switch {
	case hardware:
		// See Add{{.Name}}s: the kernel does no bounds checking of its own.
		if len(input1) < n || len(input2) < n {
			panic("simd: input slices are shorter than dst")
		}
		_{{.Type}}_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(n))
		return dst
	}
	return mul(dst, input1, input2)
}

// Div{{.Name}}s divides input1 by input2 and writes back the result into dst slice.
// len(dst) elements are processed; an empty dst is returned untouched.
func Div{{.Name}}s(dst, input1, input2 []{{.Type}}) []{{.Type}} {
	n := len(dst)
	if n == 0 {
		return dst
	}

	switch {
	case hardware:
		// See Add{{.Name}}s: the kernel does no bounds checking of its own.
		if len(input1) < n || len(input2) < n {
			panic("simd: input slices are shorter than dst")
		}
		_{{.Type}}_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(n))
		return dst
	}
	return div(dst, input1, input2)
}
{{ end }}
golang-github-kelindar-simd-1.2.0/codegen/templates/main.go000066400000000000000000000037261517522302000236440ustar00rootroot00000000000000
// Copyright (c) Roman Atachiants and contributors. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for details.

package main

import (
	"embed"
	_ "embed"
	"fmt"
	"os"
	"text/template"
)

//go:generate go run ./main.go
//go:embed *.tt
var templates embed.FS

// Type describes one numeric type the templates are expanded for.
type Type struct {
	Name  string // exported-name fragment, e.g. "Uint8" in SumUint8s
	Type  string // Go type name, also used in the kernel symbol (_uint8_sum)
	CType string // matching C type used in the generated kernels
}

var types = []Type{
	{Name: "Uint8", Type: "uint8", CType: "uint8_t"},
	{Name: "Uint16", Type: "uint16", CType: "uint16_t"},
	{Name: "Uint32", Type: "uint32", CType: "uint32_t"},
	{Name: "Uint64", Type: "uint64", CType: "uint64_t"},
	{Name: "Int8", Type: "int8", CType: "int8_t"},
	{Name: "Int16", Type: "int16", CType: "int16_t"},
	{Name: "Int32", Type: "int32", CType: "int32_t"},
	{Name: "Int64", Type: "int64", CType: "int64_t"},
	{Name: "Float32", Type: "float32", CType: "float"},
	{Name: "Float64", Type: "float64", CType: "double"},
}

func main() {
	genCode("amd64", "avx2")
	genCode("arm64", "neon")
	genCode("arm64", "apple")
	genFuncs("simd", "../simd_funcs.go")
	if err := execute("../simd_stub.go", "stub.go", "---", "---"); err != nil {
		panic(err)
	}
}

// genFuncs renders the public API wrappers (funcs.go.tt) into dst.
func genFuncs(arch, dst string) {
	if err := execute(dst, "funcs.go", arch, "---"); err != nil {
		panic(err)
	}
}

// genCode renders the underlying C kernels for vectorization (source.c.tt)
// into simd_<mode>_<arch>.c in the current directory.
func genCode(arch, mode string) {
	if err := execute(fmt.Sprintf("simd_%s_%s.c", mode, arch), "source.c", arch, mode); err != nil {
		panic(err)
	}
}

// execute renders the embedded template src+".tt" into the file dst,
// truncating any previous content. It returns the first error from reading,
// parsing, opening, rendering or closing.
func execute(dst, src, arch, mode string) (err error) {
	body, err := templates.ReadFile(src + ".tt")
	if err != nil {
		return err
	}

	cgen, err := template.New(src).Parse(string(body))
	if err != nil {
		return err
	}

	// Generated sources are plain text, so they are created without execute bits.
	out, err := os.OpenFile(dst, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644)
	if err != nil {
		return err
	}

	// Close the file on every path and report a failed close (lost write)
	// unless rendering already failed.
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	return cgen.Execute(out, struct {
		Arch  string
		Mode  string
		Types []Type
	}{
		Arch:  arch,
		Mode:  mode,
		Types: types,
	})
}
golang-github-kelindar-simd-1.2.0/codegen/templates/source.c.tt000066400000000000000000000042101517522302000244500ustar00rootroot00000000000000
// Copyright (c) Roman Atachiants and contributors. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for details.

#include <stdint.h>

// Every kernel walks exactly `size` elements. The loop counter is a uint64_t
// so that it has the same width as `size`; an int counter compared against
// (int)size truncates lengths of 2^31 and above.
// The min/max kernels seed their accumulator from input[0], so they require
// size >= 1. Integer division by a zero element is not guarded.

{{ $Mode := .Mode }}
{{ range .Types }}
// ---------------------------------- {{.Name}} ----------------------------------

void _{{.Type}}_sum({{.CType}} *input, {{.CType}} *result, uint64_t size) {
    {{.CType}} sum = 0;
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; ++i) {
        sum += input[i];
    }
    *result = sum;
}

void _{{.Type}}_min({{.CType}} *input, {{.CType}} *result, uint64_t size) {
    {{.CType}} min = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] < min) {
            min = input[i];
        }
    }
    *result = min;
}

void _{{.Type}}_max({{.CType}} *input, {{.CType}} *result, uint64_t size) {
    {{.CType}} max = input[0];
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        if (input[i] > max) {
            max = input[i];
        }
    }
    *result = max;
}

void _{{.Type}}_add({{.CType}} *input1, {{.CType}} *input2, {{.CType}} *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] + input2[i];
    }
}

void _{{.Type}}_sub({{.CType}} *input1, {{.CType}} *input2, {{.CType}} *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] - input2[i];
    }
}

void _{{.Type}}_mul({{.CType}} *input1, {{.CType}} *input2, {{.CType}} *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] * input2[i];
    }
}

void _{{.Type}}_div({{.CType}} *input1, {{.CType}} *input2, {{.CType}} *output, uint64_t size) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t i = 0; i < size; i++) {
        output[i] = input1[i] / input2[i];
    }
}
{{ end }}
golang-github-kelindar-simd-1.2.0/codegen/templates/stub.go.tt000066400000000000000000000023671517522302000243230ustar00rootroot00000000000000
// Copyright (c) Roman Atachiants and contributors. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for details.

//go:build noasm || !(amd64 || arm64)
// +build noasm !amd64,!arm64

package simd

import "unsafe"

// On builds selected by the tags above the hardware path is switched off, so
// the public wrappers always take the portable fallback and the stubs below
// are not expected to be reached.
func init() {
	hardware = false
}

{{ range .Types }}
func _{{.Type}}_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) {
	panic("simd: assembly not available")
}

func _{{.Type}}_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) {
	panic("simd: assembly not available")
}

func _{{.Type}}_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) {
	panic("simd: assembly not available")
}

func _{{.Type}}_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) {
	panic("simd: assembly not available")
}

func _{{.Type}}_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) {
	panic("simd: assembly not available")
}

func _{{.Type}}_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) {
	panic("simd: assembly not available")
}

func _{{.Type}}_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) {
	panic("simd: assembly not available")
}
{{ end }}
golang-github-kelindar-simd-1.2.0/go.mod000066400000000000000000000004601517522302000200650ustar00rootroot00000000000000module github.com/kelindar/simd go 1.18 require ( github.com/klauspost/cpuid/v2 v2.0.12 github.com/stretchr/testify v1.7.1 ) require ( github.com/davecgh/go-spew v1.1.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect ) golang-github-kelindar-simd-1.2.0/go.sum000066400000000000000000000022631517522302000201150ustar00rootroot00000000000000github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/klauspost/cpuid/v2 v2.0.12 h1:p9dKCg8i4gmOxtv35DvrYoWqYzQrvEVdjQ762Y0OqZE= github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= golang-github-kelindar-simd-1.2.0/simd.go000066400000000000000000000075061517522302000202520ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved. // Licensed under the MIT license. 
See LICENSE file in the project root for details. package simd //go:generate sh -c "cd ./codegen && ./generate.sh" import ( "runtime" "github.com/klauspost/cpuid/v2" ) var ( avx2 = cpuid.CPU.Supports(cpuid.AVX2) && cpuid.CPU.Supports(cpuid.FMA3) apple = runtime.GOARCH == "arm64" && runtime.GOOS == "darwin" neon = runtime.GOARCH == "arm64" && !apple hardware = avx2 || apple || neon ) // Number represents a number constraint for SIMD operations type Number interface { ~int | ~int8 | ~int16 | ~int32 | ~int64 | uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~float32 | ~float64 } // Sum sums up all of the elements of the slice and returns the value func Sum[T Number](input []T) T { switch v := any(input).(type) { case []int8: return T(SumInt8s(v)) case []int16: return T(SumInt16s(v)) case []int32: return T(SumInt32s(v)) case []int64: return T(SumInt64s(v)) case []uint8: return T(SumUint8s(v)) case []uint16: return T(SumUint16s(v)) case []uint32: return T(SumUint32s(v)) case []uint64: return T(SumUint64s(v)) case []float32: return T(SumFloat32s(v)) case []float64: return T(SumFloat64s(v)) default: return sum(input) } } // Sum sums up all of the elements of the slice and returns the value func sum[T Number](input []T) (sum T) { for _, v := range input { sum += v } return } // Min returns the smallest element value in the slice func Min[T Number](input []T) T { switch v := any(input).(type) { case []int8: return T(MinInt8s(v)) case []int16: return T(MinInt16s(v)) case []int32: return T(MinInt32s(v)) case []int64: return T(MinInt64s(v)) case []uint8: return T(MinUint8s(v)) case []uint16: return T(MinUint16s(v)) case []uint32: return T(MinUint32s(v)) case []uint64: return T(MinUint64s(v)) case []float32: return T(MinFloat32s(v)) case []float64: return T(MinFloat64s(v)) default: return min(input) } } // Min returns the smallest element value in the slice func min[T Number](input []T) T { min := input[0] for _, v := range input[1:] { if v < min { min = v } } return min } // Max 
returns the largest element value in the slice func Max[T Number](input []T) T { switch v := any(input).(type) { case []int8: return T(MaxInt8s(v)) case []int16: return T(MaxInt16s(v)) case []int32: return T(MaxInt32s(v)) case []int64: return T(MaxInt64s(v)) case []uint8: return T(MaxUint8s(v)) case []uint16: return T(MaxUint16s(v)) case []uint32: return T(MaxUint32s(v)) case []uint64: return T(MaxUint64s(v)) case []float32: return T(MaxFloat32s(v)) case []float64: return T(MaxFloat64s(v)) default: return max(input) } } // Max returns the largest element value in the slice func max[T Number](input []T) T { max := input[0] for _, v := range input[1:] { if v > max { max = v } } return max } // Add adds input1 to input2 and writes back the result into dst slice func add[T Number](dst, input1, input2 []T) []T { for i, v := range input1 { dst[i] = v + input2[i] } return dst } // Sub subtracts input2 from input1 and writes back the result into dst slice func sub[T Number](dst, input1, input2 []T) []T { for i, v := range input1 { dst[i] = v - input2[i] } return dst } // Mul multiplies input1 by input2 and writes back the result into dst slice func mul[T Number](dst, input1, input2 []T) []T { for i, v := range input1 { dst[i] = v * input2[i] } return dst } // Div divides input1 by input2 and writes back the result into dst slice func div[T Number](dst, input1, input2 []T) []T { for i, v := range input1 { dst[i] = v / input2[i] } return dst } golang-github-kelindar-simd-1.2.0/simd_apple_arm64.go000066400000000000000000000200731517522302000224360ustar00rootroot00000000000000//go:build !noasm && darwin && arm64 // AUTO-GENERATED BY GOCC -- DO NOT EDIT package simd import "unsafe" //go:nosplit //go:noescape func _uint8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_max(input unsafe.Pointer, result unsafe.Pointer, size 
uint64)

// The declarations below have no Go body; per the file header they are
// generated output and their implementations live outside this file.
// Shapes, matching the C kernel template in codegen/templates/source.c.tt:
//   - _<type>_sum/_min/_max(input, result, size): reduce `size` elements
//     starting at input into the single value at result.
//   - _<type>_add/_sub/_mul/_div(input1, input2, output, size): write `size`
//     element-wise results to output.
// The typed wrappers pass the address of a slice's first element for each
// pointer argument and a slice length for size.

//go:nosplit
//go:noescape
func _uint8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// ---- uint16 kernels ----

//go:nosplit
//go:noescape
func _uint16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint16_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// ---- uint32 kernels ----

//go:nosplit
//go:noescape
func _uint32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// ---- uint64 kernels ----

//go:nosplit
//go:noescape
func _uint64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _uint64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// ---- int8 kernels ----

//go:nosplit
//go:noescape
func _int8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int8_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// ---- int16 kernels ----

//go:nosplit
//go:noescape
func _int16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int16_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// ---- int32 kernels ----

//go:nosplit
//go:noescape
func _int32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// ---- int64 kernels ----

//go:nosplit
//go:noescape
func _int64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape
func _int64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer,
output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) golang-github-kelindar-simd-1.2.0/simd_apple_arm64.s000066400000000000000000004047321517522302000223030ustar00rootroot00000000000000//go:build !noasm && darwin && arm64 // AUTO-GENERATED BY GOCC -- DO NOT 
EDIT TEXT ·_uint8_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt LBB0_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540000c2 // b.hs LBB0_4 WORD $0xd2800009 // mov x9, #0 WORD $0x5280000a // mov w10, #0 WORD $0x14000029 // b LBB0_13 BB0_3: WORD $0x3900003f // strb wzr, [x1] WORD $0xd65f03c0 // ret BB0_4: WORD $0xf100811f // cmp x8, #32 WORD $0x54000082 // b.hs LBB0_6 WORD $0x5280000a // mov w10, #0 WORD $0xd2800009 // mov x9, #0 WORD $0x14000013 // b LBB0_10 BB0_6: WORD $0x9240104b // and x11, x2, #0x1f WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x9100400a // add x10, x0, #16 WORD $0x6f00e400 // movi.2d v0, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi.2d v1, #0000000000000000 BB0_7: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x4e208440 // add.16b v0, v2, v0 WORD $0x4e218461 // add.16b v1, v3, v1 WORD $0x9100814a // add x10, x10, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x54ffff61 // b.ne LBB0_7 WORD $0x4e208420 // add.16b v0, v1, v0 WORD $0x4e31b800 // addv.16b b0, v0 WORD $0x1e26000a // fmov w10, s0 WORD $0xb400030b // cbz x11, LBB0_15 WORD $0xf100217f // cmp x11, #8 WORD $0x54000203 // b.lo LBB0_13 BB0_10: WORD $0xaa0903ed // mov x13, x9 WORD $0x9240084b // and x11, x2, #0x7 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0x4e011d40 // mov.b v0[0], w10 WORD $0x8b0b01aa // add x10, x13, x11 WORD $0xcb08014a // sub x10, x10, x8 BB0_11: WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0x0e208420 // add.8b v0, v1, v0 WORD $0xb100214a // adds x10, x10, #8 WORD $0x54ffffa1 // b.ne LBB0_11 WORD $0x0e31b800 // addv.8b b0, v0 WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, LBB0_15 BB0_13: WORD $0x8b09000b // add x11, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 BB0_14: WORD $0x38401569 // ldrb w9, [x11], #1 WORD 
$0x0b0a012a // add w10, w9, w10 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB0_14 BB0_15: WORD $0x3900002a // strb w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint8_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x39400009 // ldrb w9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400064b // b.lt LBB1_14 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB1_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000025 // b LBB1_12 BB1_3: WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs LBB1_5 WORD $0xd280000a // mov x10, #0 WORD $0x14000013 // b LBB1_9 BB1_5: WORD $0x9240104b // and x11, x2, #0x1f WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x4e010d20 // dup.16b v0, w9 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB1_6: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x6e206c40 // umin.16b v0, v2, v0 WORD $0x6e216c61 // umin.16b v1, v3, v1 WORD $0x91008129 // add x9, x9, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x54ffff61 // b.ne LBB1_6 WORD $0x6e216c00 // umin.16b v0, v0, v1 WORD $0x6e31a800 // uminv.16b b0, v0 WORD $0x1e260009 // fmov w9, s0 WORD $0xb400032b // cbz x11, LBB1_14 WORD $0xf100217f // cmp x11, #8 WORD $0x540001e3 // b.lo LBB1_12 BB1_9: WORD $0xaa0a03ed // mov x13, x10 WORD $0x9240084b // and x11, x2, #0x7 WORD $0x8b0a000c // add x12, x0, x10 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x0e010d20 // dup.8b v0, w9 WORD $0x8b0b01a9 // add x9, x13, x11 WORD $0xcb080129 // sub x9, x9, x8 BB1_10: WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0x2e206c20 // umin.8b v0, v1, v0 WORD $0xb1002129 // adds x9, x9, #8 WORD $0x54ffffa1 // b.ne LBB1_10 WORD $0x2e31a800 // uminv.8b b0, v0 WORD $0x1e260009 // fmov w9, s0 WORD $0xb400012b // cbz x11, LBB1_14 BB1_12: WORD $0x8b0a000b // add x11, x0, x10 WORD $0xcb0a0108 // sub x8, x8, x10 BB1_13: WORD $0x3840156a // ldrb w10, [x11], #1 WORD $0x12001d29 // and 
w9, w9, #0xff WORD $0x6b09015f // cmp w10, w9 WORD $0x1a893149 // csel w9, w10, w9, lo WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB1_13 BB1_14: WORD $0x39000029 // strb w9, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint8_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x39400009 // ldrb w9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400064b // b.lt LBB2_14 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB2_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000025 // b LBB2_12 BB2_3: WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs LBB2_5 WORD $0xd280000a // mov x10, #0 WORD $0x14000013 // b LBB2_9 BB2_5: WORD $0x9240104b // and x11, x2, #0x1f WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x4e010d20 // dup.16b v0, w9 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB2_6: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x6e206440 // umax.16b v0, v2, v0 WORD $0x6e216461 // umax.16b v1, v3, v1 WORD $0x91008129 // add x9, x9, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x54ffff61 // b.ne LBB2_6 WORD $0x6e216400 // umax.16b v0, v0, v1 WORD $0x6e30a800 // umaxv.16b b0, v0 WORD $0x1e260009 // fmov w9, s0 WORD $0xb400032b // cbz x11, LBB2_14 WORD $0xf100217f // cmp x11, #8 WORD $0x540001e3 // b.lo LBB2_12 BB2_9: WORD $0xaa0a03ed // mov x13, x10 WORD $0x9240084b // and x11, x2, #0x7 WORD $0x8b0a000c // add x12, x0, x10 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x0e010d20 // dup.8b v0, w9 WORD $0x8b0b01a9 // add x9, x13, x11 WORD $0xcb080129 // sub x9, x9, x8 BB2_10: WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0x2e206420 // umax.8b v0, v1, v0 WORD $0xb1002129 // adds x9, x9, #8 WORD $0x54ffffa1 // b.ne LBB2_10 WORD $0x2e30a800 // umaxv.8b b0, v0 WORD $0x1e260009 // fmov w9, s0 WORD $0xb400012b // cbz x11, LBB2_14 BB2_12: WORD $0x8b0a000b // add x11, x0, x10 WORD $0xcb0a0108 // sub x8, x8, x10 BB2_13: WORD 
$0x3840156a // ldrb w10, [x11], #1 WORD $0x12001d29 // and w9, w9, #0xff WORD $0x6b09015f // cmp w10, w9 WORD $0x1a898149 // csel w9, w10, w9, hi WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB2_13 BB2_14: WORD $0x39000029 // strb w9, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint8_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt LBB3_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs LBB3_6 WORD $0xd2800009 // mov x9, #0 BB3_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 BB3_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0x38001549 // strb w9, [x10], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB3_4 BB3_5: WORD $0xd65f03c0 // ret BB3_6: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo LBB3_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo LBB3_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs LBB3_10 WORD $0xd2800009 // mov x9, #0 WORD $0x14000014 // b LBB3_14 BB3_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB3_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4e208440 // add.16b v0, v2, v0 WORD $0x4e218461 // add.16b v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0x54fffee1 // b.ne 
LBB3_11 WORD $0xb4fffc8a // cbz x10, LBB3_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo LBB3_3 BB3_14: WORD $0xaa0903ee // mov x14, x9 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x8b0a01ce // add x14, x14, x10 WORD $0xcb0801ce // sub x14, x14, x8 BB3_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0x0e208420 // add.8b v0, v1, v0 WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x54ffff61 // b.ne LBB3_15 WORD $0xb5fff92a // cbnz x10, LBB3_3 WORD $0x17ffffd2 // b LBB3_5 TEXT ·_uint8_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt LBB4_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs LBB4_6 WORD $0xd2800009 // mov x9, #0 BB4_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 BB4_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0x38001549 // strb w9, [x10], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB4_4 BB4_5: WORD $0xd65f03c0 // ret BB4_6: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo LBB4_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo LBB4_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs LBB4_10 WORD $0xd2800009 // mov x9, #0 WORD $0x14000014 // b LBB4_14 BB4_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD 
$0xaa0903ee // mov x14, x9 BB4_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6e228400 // sub.16b v0, v0, v2 WORD $0x6e238421 // sub.16b v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0x54fffee1 // b.ne LBB4_11 WORD $0xb4fffc8a // cbz x10, LBB4_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo LBB4_3 BB4_14: WORD $0xaa0903ee // mov x14, x9 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x8b0a01ce // add x14, x14, x10 WORD $0xcb0801ce // sub x14, x14, x8 BB4_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0x2e218400 // sub.8b v0, v0, v1 WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x54ffff61 // b.ne LBB4_15 WORD $0xb5fff92a // cbnz x10, LBB4_3 WORD $0x17ffffd2 // b LBB4_5 TEXT ·_uint8_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt LBB5_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs LBB5_6 WORD $0xd2800009 // mov x9, #0 BB5_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 BB5_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0x38001549 // strb w9, [x10], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB5_4 BB5_5: WORD $0xd65f03c0 // ret BB5_6: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo LBB5_3 WORD 
$0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo LBB5_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs LBB5_10 WORD $0xd2800009 // mov x9, #0 WORD $0x14000014 // b LBB5_14 BB5_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB5_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4e209c40 // mul.16b v0, v2, v0 WORD $0x4e219c61 // mul.16b v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0x54fffee1 // b.ne LBB5_11 WORD $0xb4fffc8a // cbz x10, LBB5_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo LBB5_3 BB5_14: WORD $0xaa0903ee // mov x14, x9 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x8b0a01ce // add x14, x14, x10 WORD $0xcb0801ce // sub x14, x14, x8 BB5_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0x0e209c20 // mul.8b v0, v1, v0 WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x54ffff61 // b.ne LBB5_15 WORD $0xb5fff92a // cbnz x10, LBB5_3 WORD $0x17ffffd2 // b LBB5_5 TEXT ·_uint8_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x54000c6b // b.lt LBB6_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs LBB6_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000054 // b LBB6_8 BB6_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, 
#16 WORD $0x54000a03 // b.lo LBB6_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x540009a3 // b.lo LBB6_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0903eb // mov x11, x9 WORD $0xaa0203ec // mov x12, x2 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0003ee // mov x14, x0 BB6_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0x0e033c2f // umov.b w15, v1[1] WORD $0x0e033c10 // umov.b w16, v0[1] WORD $0x0e013c31 // umov.b w17, v1[0] WORD $0x1acf0a0f // udiv w15, w16, w15 WORD $0x0e013c10 // umov.b w16, v0[0] WORD $0x1ad10a10 // udiv w16, w16, w17 WORD $0x0e053c31 // umov.b w17, v1[2] WORD $0x1e270202 // fmov s2, w16 WORD $0x0e053c10 // umov.b w16, v0[2] WORD $0x1ad10a10 // udiv w16, w16, w17 WORD $0x4e031de2 // mov.b v2[1], w15 WORD $0x0e073c2f // umov.b w15, v1[3] WORD $0x0e073c11 // umov.b w17, v0[3] WORD $0x4e051e02 // mov.b v2[2], w16 WORD $0x1acf0a2f // udiv w15, w17, w15 WORD $0x0e093c30 // umov.b w16, v1[4] WORD $0x0e093c11 // umov.b w17, v0[4] WORD $0x1ad00a30 // udiv w16, w17, w16 WORD $0x0e0b3c31 // umov.b w17, v1[5] WORD $0x0e0b3c03 // umov.b w3, v0[5] WORD $0x4e071de2 // mov.b v2[3], w15 WORD $0x0e0d3c2f // umov.b w15, v1[6] WORD $0x0e0d3c04 // umov.b w4, v0[6] WORD $0x4e091e02 // mov.b v2[4], w16 WORD $0x1ad10870 // udiv w16, w3, w17 WORD $0x1acf088f // udiv w15, w4, w15 WORD $0x0e0f3c31 // umov.b w17, v1[7] WORD $0x4e0b1e02 // mov.b v2[5], w16 WORD $0x0e0f3c10 // umov.b w16, v0[7] WORD $0x0e113c23 // umov.b w3, v1[8] WORD $0x4e0d1de2 // mov.b v2[6], w15 WORD $0x1ad10a0f // udiv w15, w16, w17 WORD $0x0e113c10 // umov.b w16, v0[8] WORD $0x1ac30a10 // udiv w16, w16, w3 WORD $0x4e0f1de2 // mov.b v2[7], w15 WORD $0x0e133c2f // umov.b w15, v1[9] WORD $0x0e133c11 // umov.b w17, v0[9] WORD $0x4e111e02 // mov.b v2[8], w16 WORD $0x1acf0a2f // udiv w15, w17, w15 WORD $0x0e153c30 // umov.b w16, v1[10] WORD $0x0e153c11 // umov.b w17, v0[10] WORD $0x1ad00a30 // udiv 
w16, w17, w16 WORD $0x0e173c31 // umov.b w17, v1[11] WORD $0x0e173c03 // umov.b w3, v0[11] WORD $0x4e131de2 // mov.b v2[9], w15 WORD $0x0e193c2f // umov.b w15, v1[12] WORD $0x0e193c04 // umov.b w4, v0[12] WORD $0x4e151e02 // mov.b v2[10], w16 WORD $0x1ad10870 // udiv w16, w3, w17 WORD $0x1acf088f // udiv w15, w4, w15 WORD $0x0e1b3c31 // umov.b w17, v1[13] WORD $0x4e171e02 // mov.b v2[11], w16 WORD $0x0e1b3c10 // umov.b w16, v0[13] WORD $0x0e1d3c23 // umov.b w3, v1[14] WORD $0x4e191de2 // mov.b v2[12], w15 WORD $0x1ad10a0f // udiv w15, w16, w17 WORD $0x0e1d3c10 // umov.b w16, v0[14] WORD $0x1ac30a10 // udiv w16, w16, w3 WORD $0x4e1b1de2 // mov.b v2[13], w15 WORD $0x0e1f3c2f // umov.b w15, v1[15] WORD $0x0e1f3c11 // umov.b w17, v0[15] WORD $0x4e1d1e02 // mov.b v2[14], w16 WORD $0x1acf0a2f // udiv w15, w17, w15 WORD $0x4e1f1de2 // mov.b v2[15], w15 WORD $0x3c810582 // str q2, [x12], #16 WORD $0xf100416b // subs x11, x11, #16 WORD $0x54fff781 // b.ne LBB6_6 WORD $0xb400016a // cbz x10, LBB6_10 BB6_8: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 BB6_9: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0x1acd0929 // udiv w9, w9, w13 WORD $0x38001549 // strb w9, [x10], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB6_9 BB6_10: WORD $0xd65f03c0 // ret TEXT ·_uint16_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt LBB7_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x540000c2 // b.hs LBB7_4 WORD $0xd2800009 // mov x9, #0 WORD $0x5280000a // mov w10, #0 WORD $0x14000013 // b LBB7_7 BB7_3: WORD $0x7900003f // strh wzr, [x1] WORD $0xd65f03c0 // ret BB7_4: WORD $0x92400c4b // and x11, x2, #0xf WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x9100400a // add x10, x0, #16 WORD $0x6f00e400 // movi.2d v0, 
#0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi.2d v1, #0000000000000000 BB7_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x4e608440 // add.8h v0, v2, v0 WORD $0x4e618461 // add.8h v1, v3, v1 WORD $0x9100814a // add x10, x10, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x54ffff61 // b.ne LBB7_5 WORD $0x4e608420 // add.8h v0, v1, v0 WORD $0x4e71b800 // addv.8h h0, v0 WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, LBB7_9 BB7_7: WORD $0x8b09040b // add x11, x0, x9, lsl #1 WORD $0xcb090108 // sub x8, x8, x9 BB7_8: WORD $0x78402569 // ldrh w9, [x11], #2 WORD $0x0b0a012a // add w10, w9, w10 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB7_8 BB7_9: WORD $0x7900002a // strh w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint16_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt LBB8_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs LBB8_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000011 // b LBB8_6 BB8_3: WORD $0x92400c4b // and x11, x2, #0xf WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x4e020d00 // dup.8h v0, w8 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB8_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x6e606c40 // umin.8h v0, v2, v0 WORD $0x6e616c61 // umin.8h v1, v3, v1 WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x54ffff61 // b.ne LBB8_4 WORD $0x6e616c00 // umin.8h v0, v0, v1 WORD $0x6e71a800 // uminv.8h h0, v0 WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, LBB8_8 BB8_6: WORD $0x8b0a040b // add x11, x0, x10, lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 BB8_7: WORD $0x7840256a // ldrh w10, [x11], #2 WORD $0x12003d08 // and w8, w8, #0xffff WORD $0x6b08015f // cmp w10, w8 WORD $0x1a883148 // csel w8, w10, w8, lo WORD 
$0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne LBB8_7 BB8_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint16_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt LBB9_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs LBB9_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000011 // b LBB9_6 BB9_3: WORD $0x92400c4b // and x11, x2, #0xf WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x4e020d00 // dup.8h v0, w8 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB9_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x6e606440 // umax.8h v0, v2, v0 WORD $0x6e616461 // umax.8h v1, v3, v1 WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x54ffff61 // b.ne LBB9_4 WORD $0x6e616400 // umax.8h v0, v0, v1 WORD $0x6e70a800 // umaxv.8h h0, v0 WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, LBB9_8 BB9_6: WORD $0x8b0a040b // add x11, x0, x10, lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 BB9_7: WORD $0x7840256a // ldrh w10, [x11], #2 WORD $0x12003d08 // and w8, w8, #0xffff WORD $0x6b08015f // cmp w10, w8 WORD $0x1a888148 // csel w8, w10, w8, hi WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne LBB9_7 BB9_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint16_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB10_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs LBB10_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB10_8 BB10_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB10_8 WORD $0xcb01004a // sub x10, x2, x1 WORD 
$0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB10_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB10_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4e608440 // add.8h v0, v2, v0 WORD $0x4e618461 // add.8h v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0x54fffee1 // b.ne LBB10_6 WORD $0xb400018a // cbz x10, LBB10_10 BB10_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB10_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0x78002549 // strh w9, [x10], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB10_9 BB10_10: WORD $0xd65f03c0 // ret TEXT ·_uint16_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB11_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs LBB11_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB11_8 BB11_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB11_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB11_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB11_6: WORD 
$0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x6e628400 // sub.8h v0, v0, v2
	WORD $0x6e638421 // sub.8h v1, v1, v3
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10041ce // subs x14, x14, #16
	WORD $0x54fffee1 // b.ne LBB11_6
	WORD $0xb400018a // cbz x10, LBB11_10
// Scalar path setup: byte offset = x9 * 2; x8 becomes the remaining count.
BB11_8:
	WORD $0xd37ff92c // lsl x12, x9, #1
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one 16-bit element per iteration.
BB11_9:
	WORD $0x78402589 // ldrh w9, [x12], #2
	WORD $0x7840256d // ldrh w13, [x11], #2
	WORD $0x4b0d0129 // sub w9, w9, w13
	WORD $0x78002549 // strh w9, [x10], #2
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB11_9
BB11_10:
	WORD $0xd65f03c0 // ret

// _uint16_mul(input1, input2, output, size):
// output[i] = input1[i] * input2[i] on 16-bit elements (low 16 bits kept).
// Only the low 32 bits of size are used and they are compared as a signed
// value, so size < 1 returns without touching memory.
// Vector path: 16 elements (two 128-bit registers) per iteration, taken only
// when size >= 16 and the unsigned differences output-input1 and
// output-input2 are both >= 32 bytes. Otherwise, and for the size%16
// remainder, the scalar loop is used.
TEXT ·_uint16_mul(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt LBB12_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100411f // cmp x8, #16
	WORD $0x54000062 // b.hs LBB12_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000019 // b LBB12_8
// Overlap checks, then vector setup: x10 = size & 15 (tail), x9 = vector count.
BB12_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo LBB12_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo LBB12_8
	WORD $0x92400c6a // and x10, x3, #0xf
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: 2 x 8 halfword lanes per iteration.
BB12_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x4e609c40 // mul.8h v0, v2, v0
	WORD $0x4e619c61 // mul.8h v1, v3, v1
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10041ce // subs x14, x14, #16
	WORD $0x54fffee1 // b.ne LBB12_6
	WORD $0xb400018a // cbz x10, LBB12_10
// Scalar path setup: start at element x9; x8 = remaining element count.
BB12_8:
	WORD $0xd37ff92c // lsl x12, x9, #1
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB12_9:
	WORD $0x78402589 // ldrh w9, [x12], #2
	WORD $0x7840256d // ldrh w13, [x11], #2
	WORD $0x1b097da9 // mul w9, w13, w9
	WORD $0x78002549 // strh w9, [x10], #2
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB12_9
BB12_10:
	WORD $0xd65f03c0 // ret

// _uint16_div(input1, input2, output, size):
// output[i] = input1[i] / input2[i], unsigned, on 16-bit elements.
// size handling matches the other element-wise kernels (low 32 bits, signed
// compare against 1). The "vector" path processes 8 elements per iteration
// but performs each division with scalar udiv after extracting lanes; it is
// taken when size >= 8 and output-input1 / output-input2 (unsigned) are
// both >= 16 bytes. The loop reuses w3/w4 as scratch, so size is not
// available afterwards (the tail count was already saved in x10).
// NOTE(review): no zero-divisor check is visible; AArch64 UDIV is documented
// to return 0 for a zero divisor - confirm that is the intended result.
TEXT ·_uint16_div(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400088b // b.lt LBB13_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x54000062 // b.hs LBB13_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000034 // b LBB13_8
// Overlap checks, then setup: x10 = size & 7 (tail), x9 = x11 = vector count.
BB13_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100415f // cmp x10, #16
	WORD $0x54000603 // b.lo LBB13_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100415f // cmp x10, #16
	WORD $0x540005a3 // b.lo LBB13_8
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0xaa0903eb // mov x11, x9
	WORD $0xaa0203ec // mov x12, x2
	WORD $0xaa0103ed // mov x13, x1
	WORD $0xaa0003ee // mov x14, x0
// 8-element loop: v0 = dividends, v1 = divisors, v2 collects the quotients.
BB13_6:
	WORD $0x3cc105c0 // ldr q0, [x14], #16
	WORD $0x3cc105a1 // ldr q1, [x13], #16
	WORD $0x0e063c2f // umov.h w15, v1[1]
	WORD $0x0e063c10 // umov.h w16, v0[1]
	WORD $0x0e023c31 // umov.h w17, v1[0]
	WORD $0x0e023c03 // umov.h w3, v0[0]
	WORD $0x1ad10871 // udiv w17, w3, w17
	WORD $0x1acf0a0f // udiv w15, w16, w15
	WORD $0x0e0a3c30 // umov.h w16, v1[2]
	WORD $0x0e0a3c03 // umov.h w3, v0[2]
	WORD $0x1ad00870 // udiv w16, w3, w16
	WORD $0x1e270222 // fmov s2, w17
	WORD $0x0e0e3c31 // umov.h w17, v1[3]
	WORD $0x0e0e3c03 // umov.h w3, v0[3]
	WORD $0x4e061de2 // mov.h v2[1], w15
	WORD $0x0e123c2f // umov.h w15, v1[4]
	WORD $0x0e123c04 // umov.h w4, v0[4]
	WORD $0x4e0a1e02 // mov.h v2[2], w16
	WORD $0x1ad10870 // udiv w16, w3, w17
	WORD $0x1acf088f // udiv w15, w4, w15
	WORD $0x0e163c31 // umov.h w17, v1[5]
	WORD $0x4e0e1e02 // mov.h v2[3], w16
	WORD $0x0e163c10 // umov.h w16, v0[5]
	WORD $0x0e1a3c23 // umov.h w3, v1[6]
	WORD $0x4e121de2 // mov.h v2[4], w15
	WORD $0x1ad10a0f // udiv w15, w16, w17
	WORD $0x0e1a3c10 // umov.h w16, v0[6]
	WORD $0x1ac30a10 // udiv w16, w16, w3
	WORD $0x4e161de2 // mov.h v2[5], w15
	WORD $0x0e1e3c2f // umov.h w15, v1[7]
	WORD $0x0e1e3c11 // umov.h w17, v0[7]
	WORD $0x4e1a1e02 // mov.h v2[6], w16
	WORD $0x1acf0a2f // udiv w15, w17, w15
	WORD $0x4e1e1de2 // mov.h v2[7], w15
	WORD $0x3c810582 // str q2, [x12], #16
	WORD $0xf100216b // subs x11, x11, #8
	WORD $0x54fffb81 // b.ne LBB13_6
	WORD $0xb400018a // cbz x10, LBB13_10
// Scalar path setup: start at element x9; x8 = remaining element count.
BB13_8:
	WORD $0xd37ff92c // lsl x12, x9, #1
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB13_9:
	WORD $0x78402589 // ldrh w9, [x12], #2
	WORD $0x7840256d // ldrh w13, [x11], #2
	WORD $0x1acd0929 // udiv w9, w9, w13
	WORD $0x78002549 // strh w9, [x10], #2
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB13_9
BB13_10:
	WORD $0xd65f03c0 // ret

// _uint32_sum(input, result, size):
// stores the wrapping 32-bit sum of input[0..size) to *result, or 0 when
// size < 1 (low 32 bits of size, signed compare). For size >= 8 two 4-lane
// accumulators consume 8 elements per iteration and are reduced with addv;
// the size%8 remainder (or the whole input when size < 8) is added by the
// scalar loop.
// NOTE(review): only offsets 0, 8 and 16 of the argument frame are read but
// the TEXT directive declares 32 argument bytes - confirm against the Go
// declaration.
TEXT ·_uint32_sum(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0x7100045f // cmp w2, #1
	WORD $0x540000eb // b.lt LBB14_3
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x540000c2 // b.hs LBB14_4
	WORD $0xd2800009 // mov x9, #0
	WORD $0x5280000a // mov w10, #0
	WORD $0x14000013 // b LBB14_7
// size < 1: result is zero.
BB14_3:
	WORD $0xb900003f // str wzr, [x1]
	WORD $0xd65f03c0 // ret
// Vector setup: x11 = size & 7 (tail), x9 = x12 = vector count, v0 = v1 = 0.
BB14_4:
	WORD $0x9240084b // and x11, x2, #0x7
	WORD $0xcb0b0109 // sub x9, x8, x11
	WORD $0x9100400a // add x10, x0, #16
	WORD $0x6f00e400 // movi.2d v0, #0000000000000000
	WORD $0xaa0903ec // mov x12, x9
	WORD $0x6f00e401 // movi.2d v1, #0000000000000000
// Vector loop, followed by the horizontal reduction into w10.
BB14_5:
	WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
	WORD $0x4ea08440 // add.4s v0, v2, v0
	WORD $0x4ea18461 // add.4s v1, v3, v1
	WORD $0x9100814a // add x10, x10, #32
	WORD $0xf100218c // subs x12, x12, #8
	WORD $0x54ffff61 // b.ne LBB14_5
	WORD $0x4ea08420 // add.4s v0, v1, v0
	WORD $0x4eb1b800 // addv.4s s0, v0
	WORD $0x1e26000a // fmov w10, s0
	WORD $0xb40000eb // cbz x11, LBB14_9
// Scalar path setup: start at element x9; x8 = remaining element count.
BB14_7:
	WORD $0x8b09080b // add x11, x0, x9, lsl #2
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: w10 is the running sum.
BB14_8:
	WORD $0xb8404569 // ldr w9, [x11], #4
	WORD $0x0b0a012a // add w10, w9, w10
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffffa1 // b.ne LBB14_8
BB14_9:
	WORD $0xb900002a // str w10, [x1]
	WORD $0xd65f03c0 // ret

// _uint32_min(input, result, size):
// stores the unsigned minimum of input[0..size) to *result.
// input[0] is loaded BEFORE size is checked and seeds the reduction, so the
// input pointer must be readable even when size < 1; in that case input[0]
// is stored unchanged. For size >= 8, 8 elements per iteration are reduced
// with umin and folded with uminv; the remainder runs in the scalar loop.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_uint32_min(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0xb9400008 // ldr w8, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x540003ab // b.lt LBB15_8
	WORD $0x92407c49 // and x9, x2, #0xffffffff
	WORD $0xf100213f // cmp x9, #8
	WORD $0x54000062 // b.hs LBB15_3
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000011 // b LBB15_6
// Vector setup: both accumulators are seeded with input[0] in every lane.
BB15_3:
	WORD $0x9240084b // and x11, x2, #0x7
	WORD $0xcb0b012a // sub x10, x9, x11
	WORD $0x4e040d00 // dup.4s v0, w8
	WORD $0x91004008 // add x8, x0, #16
	WORD $0xaa0a03ec // mov x12, x10
	WORD $0x4ea01c01 // mov.16b v1, v0
// Vector loop, followed by the horizontal reduction into w8.
BB15_4:
	WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16]
	WORD $0x6ea06c40 // umin.4s v0, v2, v0
	WORD $0x6ea16c61 // umin.4s v1, v3, v1
	WORD $0x91008108 // add x8, x8, #32
	WORD $0xf100218c // subs x12, x12, #8
	WORD $0x54ffff61 // b.ne LBB15_4
	WORD $0x6ea16c00 // umin.4s v0, v0, v1
	WORD $0x6eb1a800 // uminv.4s s0, v0
	WORD $0x1e260008 // fmov w8, s0
	WORD $0xb400010b // cbz x11, LBB15_8
// Scalar path setup: start at element x10; x9 = remaining element count.
BB15_6:
	WORD $0x8b0a080b // add x11, x0, x10, lsl #2
	WORD $0xcb0a0129 // sub x9, x9, x10
// Scalar loop: keep the smaller (unsigned) of w10 and the running minimum.
BB15_7:
	WORD $0xb840456a // ldr w10, [x11], #4
	WORD $0x6b08015f // cmp w10, w8
	WORD $0x1a883148 // csel w8, w10, w8, lo
	WORD $0xf1000529 // subs x9, x9, #1
	WORD $0x54ffff81 // b.ne LBB15_7
BB15_8:
	WORD $0xb9000028 // str w8, [x1]
	WORD $0xd65f03c0 // ret

// _uint32_max(input, result, size):
// stores the unsigned maximum of input[0..size) to *result.
// Same structure as _uint32_min with umax/umaxv and csel "hi": input[0] is
// loaded before the size check (so input must be readable even when
// size < 1) and is the value stored when size < 1.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_uint32_max(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0xb9400008 // ldr w8, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x540003ab // b.lt LBB16_8
	WORD $0x92407c49 // and x9, x2, #0xffffffff
	WORD $0xf100213f // cmp x9, #8
	WORD $0x54000062 // b.hs LBB16_3
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000011 // b LBB16_6
// Vector setup: both accumulators are seeded with input[0] in every lane.
BB16_3:
	WORD $0x9240084b // and x11, x2, #0x7
	WORD $0xcb0b012a // sub x10, x9, x11
	WORD $0x4e040d00 // dup.4s v0, w8
	WORD $0x91004008 // add x8, x0, #16
	WORD $0xaa0a03ec // mov x12, x10
	WORD $0x4ea01c01 // mov.16b v1, v0
// Vector loop, followed by the horizontal reduction into w8.
BB16_4:
	WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16]
	WORD $0x6ea06440 // umax.4s v0, v2, v0
	WORD $0x6ea16461 // umax.4s v1, v3, v1
	WORD $0x91008108 // add x8, x8, #32
	WORD $0xf100218c // subs x12, x12, #8
	WORD $0x54ffff61 // b.ne LBB16_4
	WORD $0x6ea16400 // umax.4s v0, v0, v1
	WORD $0x6eb0a800 // umaxv.4s s0, v0
	WORD $0x1e260008 // fmov w8, s0
	WORD $0xb400010b // cbz x11, LBB16_8
// Scalar path setup: start at element x10; x9 = remaining element count.
BB16_6:
	WORD $0x8b0a080b // add x11, x0, x10, lsl #2
	WORD $0xcb0a0129 // sub x9, x9, x10
// Scalar loop: keep the larger (unsigned) of w10 and the running maximum.
BB16_7:
	WORD $0xb840456a // ldr w10, [x11], #4
	WORD $0x6b08015f // cmp w10, w8
	WORD $0x1a888148 // csel w8, w10, w8, hi
	WORD $0xf1000529 // subs x9, x9, #1
	WORD $0x54ffff81 // b.ne LBB16_7
BB16_8:
	WORD $0xb9000028 // str w8, [x1]
	WORD $0xd65f03c0 // ret

// _uint32_add(input1, input2, output, size):
// output[i] = input1[i] + input2[i] on 32-bit elements (wrapping).
// size < 1 (low 32 bits, signed compare) returns immediately. Vector path:
// 8 elements per iteration when size >= 8 and the unsigned differences
// output-input1 and output-input2 are both >= 32 bytes; otherwise, and for
// the size%8 remainder, the scalar loop is used.
TEXT ·_uint32_add(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt LBB17_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x54000062 // b.hs LBB17_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000019 // b LBB17_8
// Overlap checks, then vector setup: x10 = size & 7 (tail), x9 = vector count.
BB17_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo LBB17_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo LBB17_8
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: 2 x 4 word lanes per iteration.
BB17_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x4ea08440 // add.4s v0, v2, v0
	WORD $0x4ea18461 // add.4s v1, v3, v1
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10021ce // subs x14, x14, #8
	WORD $0x54fffee1 // b.ne LBB17_6
	WORD $0xb400018a // cbz x10, LBB17_10
// Scalar path setup: byte offset = x9 * 4; x8 = remaining element count.
BB17_8:
	WORD $0xd37ef52c // lsl x12, x9, #2
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB17_9:
	WORD $0xb8404589 // ldr w9, [x12], #4
	WORD $0xb840456d // ldr w13, [x11], #4
	WORD $0x0b0901a9 // add w9, w13, w9
	WORD $0xb8004549 // str w9, [x10], #4
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB17_9
BB17_10:
	WORD $0xd65f03c0 // ret

// _uint32_sub(input1, input2, output, size):
// output[i] = input1[i] - input2[i] on 32-bit elements (wrapping).
// Control flow is identical to _uint32_add: vector path of 8 elements per
// iteration guarded by size >= 8 and 32-byte output/input separation,
// scalar loop for everything else.
TEXT ·_uint32_sub(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt LBB18_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x54000062 // b.hs LBB18_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000019 // b LBB18_8
// Overlap checks, then vector setup: x10 = size & 7 (tail), x9 = vector count.
BB18_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo LBB18_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo LBB18_8
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: v0/v1 hold input1, v2/v3 hold input2.
BB18_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x6ea28400 // sub.4s v0, v0, v2
	WORD $0x6ea38421 // sub.4s v1, v1, v3
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10021ce // subs x14, x14, #8
	WORD $0x54fffee1 // b.ne LBB18_6
	WORD $0xb400018a // cbz x10, LBB18_10
// Scalar path setup: byte offset = x9 * 4; x8 = remaining element count.
BB18_8:
	WORD $0xd37ef52c // lsl x12, x9, #2
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: w9 = input1[i], w13 = input2[i].
BB18_9:
	WORD $0xb8404589 // ldr w9, [x12], #4
	WORD $0xb840456d // ldr w13, [x11], #4
	WORD $0x4b0d0129 // sub w9, w9, w13
	WORD $0xb8004549 // str w9, [x10], #4
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB18_9
BB18_10:
	WORD $0xd65f03c0 // ret

// _uint32_mul(input1, input2, output, size):
// output[i] = input1[i] * input2[i] on 32-bit elements (low 32 bits kept).
// Control flow is identical to _uint32_add: vector path of 8 elements per
// iteration guarded by size >= 8 and 32-byte output/input separation,
// scalar loop for everything else.
TEXT ·_uint32_mul(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt LBB19_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x54000062 // b.hs LBB19_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000019 // b LBB19_8
// Overlap checks, then vector setup: x10 = size & 7 (tail), x9 = vector count.
BB19_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo LBB19_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo LBB19_8
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: 2 x 4 word lanes per iteration.
BB19_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x4ea09c40 // mul.4s v0, v2, v0
	WORD $0x4ea19c61 // mul.4s v1, v3, v1
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10021ce // subs x14, x14, #8
	WORD $0x54fffee1 // b.ne LBB19_6
	WORD $0xb400018a // cbz x10, LBB19_10
// Scalar path setup: byte offset = x9 * 4; x8 = remaining element count.
BB19_8:
	WORD $0xd37ef52c // lsl x12, x9, #2
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB19_9:
	WORD $0xb8404589 // ldr w9, [x12], #4
	WORD $0xb840456d // ldr w13, [x11], #4
	WORD $0x1b097da9 // mul w9, w13, w9
	WORD $0xb8004549 // str w9, [x10], #4
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB19_9
BB19_10:
	WORD $0xd65f03c0 // ret

// _uint32_div(input1, input2, output, size):
// output[i] = input1[i] / input2[i], unsigned, on 32-bit elements.
// The 4-element path loads one vector from each input, divides each lane
// with scalar udiv and reassembles the result; it is taken when size >= 4
// and output-input1 / output-input2 (unsigned) are both >= 16 bytes. The
// loop uses w3-w7 as scratch, so size is clobbered (tail count is in x10).
// NOTE(review): no zero-divisor check is visible; AArch64 UDIV is documented
// to return 0 for a zero divisor - confirm that is the intended result.
TEXT ·_uint32_div(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400068b // b.lt LBB20_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs LBB20_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000024 // b LBB20_8
// Overlap checks, then setup: x10 = size & 3 (tail), x9 = x11 = vector count.
BB20_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100415f // cmp x10, #16
	WORD $0x54000403 // b.lo LBB20_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100415f // cmp x10, #16
	WORD $0x540003a3 // b.lo LBB20_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0xaa0903eb // mov x11, x9
	WORD $0xaa0203ec // mov x12, x2
	WORD $0xaa0103ed // mov x13, x1
	WORD $0xaa0003ee // mov x14, x0
// 4-element loop: v0 = dividends, v1 = divisors; quotients rebuilt in v0.
BB20_6:
	WORD $0x3cc105c0 // ldr q0, [x14], #16
	WORD $0x3cc105a1 // ldr q1, [x13], #16
	WORD $0x0e0c3c2f // mov.s w15, v1[1]
	WORD $0x0e0c3c10 // mov.s w16, v0[1]
	WORD $0x0e143c31 // mov.s w17, v1[2]
	WORD $0x0e143c03 // mov.s w3, v0[2]
	WORD $0x0e1c3c24 // mov.s w4, v1[3]
	WORD $0x1e260025 // fmov w5, s1
	WORD $0x0e1c3c06 // mov.s w6, v0[3]
	WORD $0x1e260007 // fmov w7, s0
	WORD $0x1ac508e5 // udiv w5, w7, w5
	WORD $0x1acf0a0f // udiv w15, w16, w15
	WORD $0x1e2700a0 // fmov s0, w5
	WORD $0x4e0c1de0 // mov.s v0[1], w15
	WORD $0x1ad1086f // udiv w15, w3, w17
	WORD $0x4e141de0 // mov.s v0[2], w15
	WORD $0x1ac408cf // udiv w15, w6, w4
	WORD $0x4e1c1de0 // mov.s v0[3], w15
	WORD $0x3c810580 // str q0, [x12], #16
	WORD $0xf100116b // subs x11, x11, #4
	WORD $0x54fffd81 // b.ne LBB20_6
	WORD $0xb400018a // cbz x10, LBB20_10
// Scalar path setup: byte offset = x9 * 4; x8 = remaining element count.
BB20_8:
	WORD $0xd37ef52c // lsl x12, x9, #2
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB20_9:
	WORD $0xb8404589 // ldr w9, [x12], #4
	WORD $0xb840456d // ldr w13, [x11], #4
	WORD $0x1acd0929 // udiv w9, w9, w13
	WORD $0xb8004549 // str w9, [x10], #4
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB20_9
BB20_10:
	WORD $0xd65f03c0 // ret

// _uint64_sum(input, result, size):
// stores the wrapping 64-bit sum of input[0..size) to *result, or 0 when
// size < 1 (low 32 bits of size, signed compare). For size >= 4 two 2-lane
// accumulators consume 4 elements per iteration and are reduced with addp;
// the size%4 remainder runs in the scalar loop.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_uint64_sum(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0x7100045f // cmp w2, #1
	WORD $0x540000eb // b.lt LBB21_3
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x540000c2 // b.hs LBB21_4
	WORD $0xd2800009 // mov x9, #0
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000013 // b LBB21_7
// size < 1: result is zero.
BB21_3:
	WORD $0xf900003f // str xzr, [x1]
	WORD $0xd65f03c0 // ret
// Vector setup: x11 = size & 3 (tail), x9 = x12 = vector count, v0 = v1 = 0.
BB21_4:
	WORD $0x9240044b // and x11, x2, #0x3
	WORD $0xcb0b0109 // sub x9, x8, x11
	WORD $0x9100400a // add x10, x0, #16
	WORD $0x6f00e400 // movi.2d v0, #0000000000000000
	WORD $0xaa0903ec // mov x12, x9
	WORD $0x6f00e401 // movi.2d v1, #0000000000000000
// Vector loop, followed by the horizontal reduction into x10.
BB21_5:
	WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
	WORD $0x4ee08440 // add.2d v0, v2, v0
	WORD $0x4ee18461 // add.2d v1, v3, v1
	WORD $0x9100814a // add x10, x10, #32
	WORD $0xf100118c // subs x12, x12, #4
	WORD $0x54ffff61 // b.ne LBB21_5
	WORD $0x4ee08420 // add.2d v0, v1, v0
	WORD $0x5ef1b800 // addp.2d d0, v0
	WORD $0x9e66000a // fmov x10, d0
	WORD $0xb40000eb // cbz x11, LBB21_9
// Scalar path setup: start at element x9; x8 = remaining element count.
BB21_7:
	WORD $0x8b090c0b // add x11, x0, x9, lsl #3
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: x10 is the running sum.
BB21_8:
	WORD $0xf8408569 // ldr x9, [x11], #8
	WORD $0x8b0a012a // add x10, x9, x10
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffffa1 // b.ne LBB21_8
BB21_9:
	WORD $0xf900002a // str x10, [x1]
	WORD $0xd65f03c0 // ret

// _uint64_min(input, result, size):
// stores the unsigned minimum of input[0..size) to *result.
// input[0] is loaded BEFORE size is checked and seeds the reduction, so the
// input pointer must be readable even when size < 1; in that case input[0]
// is stored unchanged. The vector path (size >= 4, 4 elements per iteration)
// selects lanes with cmhi + bit/bif rather than a min instruction; the
// remainder runs in the scalar loop.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_uint64_min(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0xf9400009 // ldr x9, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x5400044b // b.lt LBB22_8
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs LBB22_3
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000016 // b LBB22_6
// Vector setup: both accumulators are seeded with input[0] in every lane.
BB22_3:
	WORD $0x9240044b // and x11, x2, #0x3
	WORD $0xcb0b010a // sub x10, x8, x11
	WORD $0x4e080d20 // dup.2d v0, x9
	WORD $0x91004009 // add x9, x0, #16
	WORD $0xaa0a03ec // mov x12, x10
	WORD $0x4ea01c01 // mov.16b v1, v0
// Vector loop: replace an accumulator lane where accumulator > loaded value;
// then fold v1 into v0 and the high lane into the low lane.
BB22_4:
	WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16]
	WORD $0x6ee23404 // cmhi.2d v4, v0, v2
	WORD $0x6ea41c40 // bit.16b v0, v2, v4
	WORD $0x6ee33422 // cmhi.2d v2, v1, v3
	WORD $0x6ea21c61 // bit.16b v1, v3, v2
	WORD $0x91008129 // add x9, x9, #32
	WORD $0xf100118c // subs x12, x12, #4
	WORD $0x54ffff21 // b.ne LBB22_4
	WORD $0x6ee03422 // cmhi.2d v2, v1, v0
	WORD $0x6ee21c20 // bif.16b v0, v1, v2
	WORD $0x6e004001 // ext.16b v1, v0, v0, #8
	WORD $0x7ee03422 // cmhi d2, d1, d0
	WORD $0x2ee21c20 // bif.8b v0, v1, v2
	WORD $0x9e660009 // fmov x9, d0
	WORD $0xb400010b // cbz x11, LBB22_8
// Scalar path setup: start at element x10; x8 = remaining element count.
BB22_6:
	WORD $0x8b0a0c0b // add x11, x0, x10, lsl #3
	WORD $0xcb0a0108 // sub x8, x8, x10
// Scalar loop: keep the smaller (unsigned) of x10 and the running minimum.
BB22_7:
	WORD $0xf840856a // ldr x10, [x11], #8
	WORD $0xeb09015f // cmp x10, x9
	WORD $0x9a893149 // csel x9, x10, x9, lo
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff81 // b.ne LBB22_7
BB22_8:
	WORD $0xf9000029 // str x9, [x1]
	WORD $0xd65f03c0 // ret

// _uint64_max(input, result, size):
// stores the unsigned maximum of input[0..size) to *result.
// Same structure as _uint64_min with the cmhi operands swapped and csel
// "hi": input[0] is loaded before the size check (so input must be readable
// even when size < 1) and is the value stored when size < 1.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_uint64_max(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0xf9400009 // ldr x9, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x5400044b // b.lt LBB23_8
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs LBB23_3
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000016 // b LBB23_6
// Vector setup: both accumulators are seeded with input[0] in every lane.
BB23_3:
	WORD $0x9240044b // and x11, x2, #0x3
	WORD $0xcb0b010a // sub x10, x8, x11
	WORD $0x4e080d20 // dup.2d v0, x9
	WORD $0x91004009 // add x9, x0, #16
	WORD $0xaa0a03ec // mov x12, x10
	WORD $0x4ea01c01 // mov.16b v1, v0
// Vector loop: replace an accumulator lane where loaded value > accumulator;
// then fold v1 into v0 and the high lane into the low lane.
BB23_4:
	WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16]
	WORD $0x6ee03444 // cmhi.2d v4, v2, v0
	WORD $0x6ea41c40 // bit.16b v0, v2, v4
	WORD $0x6ee13462 // cmhi.2d v2, v3, v1
	WORD $0x6ea21c61 // bit.16b v1, v3, v2
	WORD $0x91008129 // add x9, x9, #32
	WORD $0xf100118c // subs x12, x12, #4
	WORD $0x54ffff21 // b.ne LBB23_4
	WORD $0x6ee13402 // cmhi.2d v2, v0, v1
	WORD $0x6ee21c20 // bif.16b v0, v1, v2
	WORD $0x6e004001 // ext.16b v1, v0, v0, #8
	WORD $0x7ee13402 // cmhi d2, d0, d1
	WORD $0x2ee21c20 // bif.8b v0, v1, v2
	WORD $0x9e660009 // fmov x9, d0
	WORD $0xb400010b // cbz x11, LBB23_8
// Scalar path setup: start at element x10; x8 = remaining element count.
BB23_6:
	WORD $0x8b0a0c0b // add x11, x0, x10, lsl #3
	WORD $0xcb0a0108 // sub x8, x8, x10
// Scalar loop: keep the larger (unsigned) of x10 and the running maximum.
BB23_7:
	WORD $0xf840856a // ldr x10, [x11], #8
	WORD $0xeb09015f // cmp x10, x9
	WORD $0x9a898149 // csel x9, x10, x9, hi
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff81 // b.ne LBB23_7
BB23_8:
	WORD $0xf9000029 // str x9, [x1]
	WORD $0xd65f03c0 // ret

// _uint64_add(input1, input2, output, size):
// output[i] = input1[i] + input2[i] on 64-bit elements (wrapping).
// size < 1 (low 32 bits, signed compare) returns immediately. Vector path:
// 4 elements per iteration when size >= 4 and the unsigned differences
// output-input1 and output-input2 are both >= 32 bytes; otherwise, and for
// the size%4 remainder, the scalar loop is used.
TEXT ·_uint64_add(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt LBB24_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs LBB24_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000019 // b LBB24_8
// Overlap checks, then vector setup: x10 = size & 3 (tail), x9 = vector count.
BB24_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo LBB24_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo LBB24_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: 2 x 2 doubleword lanes per iteration.
BB24_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x4ee08440 // add.2d v0, v2, v0
	WORD $0x4ee18461 // add.2d v1, v3, v1
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0x54fffee1 // b.ne LBB24_6
	WORD $0xb400018a // cbz x10, LBB24_10
// Scalar path setup: byte offset = x9 * 8; x8 = remaining element count.
BB24_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB24_9:
	WORD $0xf8408589 // ldr x9, [x12], #8
	WORD $0xf840856d // ldr x13, [x11], #8
	WORD $0x8b0901a9 // add x9, x13, x9
	WORD $0xf8008549 // str x9, [x10], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB24_9
BB24_10:
	WORD $0xd65f03c0 // ret

// _uint64_sub(input1, input2, output, size):
// output[i] = input1[i] - input2[i] on 64-bit elements (wrapping).
// Control flow is identical to _uint64_add: vector path of 4 elements per
// iteration guarded by size >= 4 and 32-byte output/input separation,
// scalar loop for everything else.
TEXT ·_uint64_sub(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt LBB25_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs LBB25_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000019 // b LBB25_8
// Overlap checks, then vector setup: x10 = size & 3 (tail), x9 = vector count.
BB25_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo LBB25_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo LBB25_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: v0/v1 hold input1, v2/v3 hold input2.
BB25_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x6ee28400 // sub.2d v0, v0, v2
	WORD $0x6ee38421 // sub.2d v1, v1, v3
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0x54fffee1 // b.ne LBB25_6
	WORD $0xb400018a // cbz x10, LBB25_10
// Scalar path setup: byte offset = x9 * 8; x8 = remaining element count.
BB25_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: x9 = input1[i], x13 = input2[i].
BB25_9:
	WORD $0xf8408589 // ldr x9, [x12], #8
	WORD $0xf840856d // ldr x13, [x11], #8
	WORD $0xcb0d0129 // sub x9, x9, x13
	WORD $0xf8008549 // str x9, [x10], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB25_9
BB25_10:
	WORD $0xd65f03c0 // ret

// _uint64_mul(input1, input2, output, size):
// output[i] = input1[i] * input2[i] on 64-bit elements (low 64 bits kept).
// The 4-element path is guarded like _uint64_add (size >= 4, 32-byte
// output/input separation) but multiplies each lane with scalar mul after
// moving it to a general register. input1 is read with one ldp; input2 is
// read as two separate 16-byte loads, the second of which post-increments
// x12 by 32. The remainder runs in the scalar loop.
TEXT ·_uint64_mul(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x540006eb // b.lt LBB26_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs LBB26_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000027 // b LBB26_8
// Overlap checks, then setup: x10 = size & 3 (tail), x9 = x14 = vector count.
BB26_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000463 // b.lo LBB26_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000403 // b.lo LBB26_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// 4-element loop: products for elements 0-1 go to v0, elements 2-3 to v1.
BB26_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0x3cdf0182 // ldur q2, [x12, #-16]
	WORD $0x4e183c0f // mov.d x15, v0[1]
	WORD $0x4e183c50 // mov.d x16, v2[1]
	WORD $0x9b0f7e0f // mul x15, x16, x15
	WORD $0x9e660010 // fmov x16, d0
	WORD $0x9e660051 // fmov x17, d2
	WORD $0x9b107e30 // mul x16, x17, x16
	WORD $0x9e670200 // fmov d0, x16
	WORD $0x4e181de0 // mov.d v0[1], x15
	WORD $0x3cc20582 // ldr q2, [x12], #32
	WORD $0x4e183c2f // mov.d x15, v1[1]
	WORD $0x4e183c50 // mov.d x16, v2[1]
	WORD $0x9b0f7e0f // mul x15, x16, x15
	WORD $0x9e660030 // fmov x16, d1
	WORD $0x9e660051 // fmov x17, d2
	WORD $0x9b107e30 // mul x16, x17, x16
	WORD $0x9e670201 // fmov d1, x16
	WORD $0x4e181de1 // mov.d v1[1], x15
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0x54fffd21 // b.ne LBB26_6
	WORD $0xb400018a // cbz x10, LBB26_10
// Scalar path setup: byte offset = x9 * 8; x8 = remaining element count.
BB26_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB26_9:
	WORD $0xf8408589 // ldr x9, [x12], #8
	WORD $0xf840856d // ldr x13, [x11], #8
	WORD $0x9b097da9 // mul x9, x13, x9
	WORD $0xf8008549 // str x9, [x10], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB26_9
BB26_10:
	WORD $0xd65f03c0 // ret

// _uint64_div(input1, input2, output, size):
// output[i] = input1[i] / input2[i], unsigned, on 64-bit elements.
// The 2-element path loads one vector from each input and divides both
// lanes with scalar udiv; it is taken when size >= 2 and output-input1 /
// output-input2 (unsigned) are both >= 16 bytes. A single leftover element
// (size & 1) is handled by the scalar loop.
// NOTE(review): no zero-divisor check is visible; AArch64 UDIV is documented
// to return 0 for a zero divisor - confirm that is the intended result.
TEXT ·_uint64_div(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400058b // b.lt LBB27_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100091f // cmp x8, #2
	WORD $0x54000062 // b.hs LBB27_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x1400001c // b LBB27_8
// Overlap checks, then setup: x10 = size & 1 (tail), x9 = x11 = vector count.
BB27_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100415f // cmp x10, #16
	WORD $0x54000303 // b.lo LBB27_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100415f // cmp x10, #16
	WORD $0x540002a3 // b.lo LBB27_8
	WORD $0x9240006a // and x10, x3, #0x1
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0xaa0903eb // mov x11, x9
	WORD $0xaa0203ec // mov x12, x2
	WORD $0xaa0103ed // mov x13, x1
	WORD $0xaa0003ee // mov x14, x0
// 2-element loop: v0 = dividends, v1 = divisors; quotients rebuilt in v0.
BB27_6:
	WORD $0x3cc105c0 // ldr q0, [x14], #16
	WORD $0x3cc105a1 // ldr q1, [x13], #16
	WORD $0x4e183c2f // mov.d x15, v1[1]
	WORD $0x4e183c10 // mov.d x16, v0[1]
	WORD $0x9acf0a0f // udiv x15, x16, x15
	WORD $0x9e660030 // fmov x16, d1
	WORD $0x9e660011 // fmov x17, d0
	WORD $0x9ad00a30 // udiv x16, x17, x16
	WORD $0x9e670200 // fmov d0, x16
	WORD $0x4e181de0 // mov.d v0[1], x15
	WORD $0x3c810580 // str q0, [x12], #16
	WORD $0xf100096b // subs x11, x11, #2
	WORD $0x54fffe81 // b.ne LBB27_6
	WORD $0xb400018a // cbz x10, LBB27_10
// Scalar path setup: byte offset = x9 * 8; x8 = remaining element count.
BB27_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one element per iteration.
BB27_9:
	WORD $0xf8408589 // ldr x9, [x12], #8
	WORD $0xf840856d // ldr x13, [x11], #8
	WORD $0x9acd0929 // udiv x9, x9, x13
	WORD $0xf8008549 // str x9, [x10], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB27_9
BB27_10:
	WORD $0xd65f03c0 // ret

// _int8_sum(input, result, size):
// stores the wrapping 8-bit sum of input[0..size) to *result (single byte
// store), or 0 when size < 1 (low 32 bits of size, signed compare).
// Three tiers: 32 bytes per iteration when size >= 32, then 8 bytes per
// iteration while at least 8 elements remain, then a byte-at-a-time loop.
// The running sum is carried between tiers in w10 / lane 0 of v0.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_int8_sum(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0x7100045f // cmp w2, #1
	WORD $0x540000eb // b.lt LBB28_3
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x540000c2 // b.hs LBB28_4
	WORD $0xd2800009 // mov x9, #0
	WORD $0x5280000a // mov w10, #0
	WORD $0x14000029 // b LBB28_13
// size < 1: result is zero.
BB28_3:
	WORD $0x3900003f // strb wzr, [x1]
	WORD $0xd65f03c0 // ret
// size >= 8: choose between the 32-byte and the 8-byte tier.
BB28_4:
	WORD $0xf100811f // cmp x8, #32
	WORD $0x54000082 // b.hs LBB28_6
	WORD $0x5280000a // mov w10, #0
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000013 // b LBB28_10
// 32-byte tier setup: x11 = size & 31 (tail), x9 = x12 = count, v0 = v1 = 0.
BB28_6:
	WORD $0x9240104b // and x11, x2, #0x1f
	WORD $0xcb0b0109 // sub x9, x8, x11
	WORD $0x9100400a // add x10, x0, #16
	WORD $0x6f00e400 // movi.2d v0, #0000000000000000
	WORD $0xaa0903ec // mov x12, x9
	WORD $0x6f00e401 // movi.2d v1, #0000000000000000
// 32-byte loop, reduction into w10, then dispatch on the remaining count.
BB28_7:
	WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
	WORD $0x4e208440 // add.16b v0, v2, v0
	WORD $0x4e218461 // add.16b v1, v3, v1
	WORD $0x9100814a // add x10, x10, #32
	WORD $0xf100818c // subs x12, x12, #32
	WORD $0x54ffff61 // b.ne LBB28_7
	WORD $0x4e208420 // add.16b v0, v1, v0
	WORD $0x4e31b800 // addv.16b b0, v0
	WORD $0x1e26000a // fmov w10, s0
	WORD $0xb400030b // cbz x11, LBB28_15
	WORD $0xf100217f // cmp x11, #8
	WORD $0x54000203 // b.lo LBB28_13
// 8-byte tier setup: x11 = size & 7, x9 = size - x11, v0 lane 0 = running
// sum, x10 = negative element count that the loop steps up to zero.
BB28_10:
	WORD $0xaa0903ed // mov x13, x9
	WORD $0x9240084b // and x11, x2, #0x7
	WORD $0x8b09000c // add x12, x0, x9
	WORD $0xcb0b0109 // sub x9, x8, x11
	WORD $0x2f00e400 // movi d0, #0000000000000000
	WORD $0x4e011d40 // mov.b v0[0], w10
	WORD $0x8b0b01aa // add x10, x13, x11
	WORD $0xcb08014a // sub x10, x10, x8
// 8-byte loop, then reduction into w10.
BB28_11:
	WORD $0xfc408581 // ldr d1, [x12], #8
	WORD $0x0e208420 // add.8b v0, v1, v0
	WORD $0xb100214a // adds x10, x10, #8
	WORD $0x54ffffa1 // b.ne LBB28_11
	WORD $0x0e31b800 // addv.8b b0, v0
	WORD $0x1e26000a // fmov w10, s0
	WORD $0xb40000eb // cbz x11, LBB28_15
// Scalar path setup: start at element x9; x8 = remaining element count.
BB28_13:
	WORD $0x8b09000b // add x11, x0, x9
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: w10 is the running sum (only its low byte is stored).
BB28_14:
	WORD $0x38401569 // ldrb w9, [x11], #1
	WORD $0x0b0a012a // add w10, w9, w10
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffffa1 // b.ne LBB28_14
BB28_15:
	WORD $0x3900002a // strb w10, [x1]
	WORD $0xd65f03c0 // ret

// _int8_min(input, result, size):
// stores the signed minimum of input[0..size) to *result (single byte store).
// input[0] is loaded BEFORE size is checked and seeds the reduction, so the
// input pointer must be readable even when size < 1; in that case input[0]
// is stored unchanged. Three tiers as in _int8_sum: 32 bytes per iteration
// (smin/sminv), 8 bytes per iteration, then a scalar loop that sign-extends
// the running value before comparing.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_int8_min(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0x39400009 // ldrb w9, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x5400064b // b.lt LBB29_14
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x54000062 // b.hs LBB29_3
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000025 // b LBB29_12
// size >= 8: choose between the 32-byte and the 8-byte tier.
BB29_3:
	WORD $0xf100811f // cmp x8, #32
	WORD $0x54000062 // b.hs LBB29_5
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000013 // b LBB29_9
// 32-byte tier setup: accumulators seeded with input[0] in every lane.
BB29_5:
	WORD $0x9240104b // and x11, x2, #0x1f
	WORD $0xcb0b010a // sub x10, x8, x11
	WORD $0x4e010d20 // dup.16b v0, w9
	WORD $0x91004009 // add x9, x0, #16
	WORD $0xaa0a03ec // mov x12, x10
	WORD $0x4ea01c01 // mov.16b v1, v0
// 32-byte loop, reduction into w9, then dispatch on the remaining count.
BB29_6:
	WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16]
	WORD $0x4e206c40 // smin.16b v0, v2, v0
	WORD $0x4e216c61 // smin.16b v1, v3, v1
	WORD $0x91008129 // add x9, x9, #32
	WORD $0xf100818c // subs x12, x12, #32
	WORD $0x54ffff61 // b.ne LBB29_6
	WORD $0x4e216c00 // smin.16b v0, v0, v1
	WORD $0x4e31a800 // sminv.16b b0, v0
	WORD $0x1e260009 // fmov w9, s0
	WORD $0xb400032b // cbz x11, LBB29_14
	WORD $0xf100217f // cmp x11, #8
	WORD $0x540001e3 // b.lo LBB29_12
// 8-byte tier setup: x11 = size & 7, x10 = size - x11, v0 = running minimum
// in every lane, x9 = negative element count stepped up to zero.
BB29_9:
	WORD $0xaa0a03ed // mov x13, x10
	WORD $0x9240084b // and x11, x2, #0x7
	WORD $0x8b0a000c // add x12, x0, x10
	WORD $0xcb0b010a // sub x10, x8, x11
	WORD $0x0e010d20 // dup.8b v0, w9
	WORD $0x8b0b01a9 // add x9, x13, x11
	WORD $0xcb080129 // sub x9, x9, x8
// 8-byte loop, then reduction into w9.
BB29_10:
	WORD $0xfc408581 // ldr d1, [x12], #8
	WORD $0x0e206c20 // smin.8b v0, v1, v0
	WORD $0xb1002129 // adds x9, x9, #8
	WORD $0x54ffffa1 // b.ne LBB29_10
	WORD $0x0e31a800 // sminv.8b b0, v0
	WORD $0x1e260009 // fmov w9, s0
	WORD $0xb400012b // cbz x11, LBB29_14
// Scalar path setup: start at element x10; x8 = remaining element count.
BB29_12:
	WORD $0x8b0a000b // add x11, x0, x10
	WORD $0xcb0a0108 // sub x8, x8, x10
// Scalar loop: signed compare of the new byte against the running minimum.
BB29_13:
	WORD $0x38c0156a // ldrsb w10, [x11], #1
	WORD $0x13001d29 // sxtb w9, w9
	WORD $0x6b09015f // cmp w10, w9
	WORD $0x1a89b149 // csel w9, w10, w9, lt
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB29_13
BB29_14:
	WORD $0x39000029 // strb w9, [x1]
	WORD $0xd65f03c0 // ret

// _int8_max(input, result, size):
// stores the signed maximum of input[0..size) to *result (single byte store).
// Same structure as _int8_min with smax/smaxv and csel "gt": input[0] is
// loaded before the size check (so input must be readable even when
// size < 1) and is the value stored when size < 1.
// NOTE(review): three arguments are read but 32 argument bytes are declared -
// confirm against the Go declaration.
TEXT ·_int8_max(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0x39400009 // ldrb w9, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x5400064b // b.lt LBB30_14
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x54000062 // b.hs LBB30_3
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000025 // b LBB30_12
// size >= 8: choose between the 32-byte and the 8-byte tier.
BB30_3:
	WORD $0xf100811f // cmp x8, #32
	WORD $0x54000062 // b.hs LBB30_5
	WORD $0xd280000a // mov x10, #0
	WORD $0x14000013 // b LBB30_9
// 32-byte tier setup: accumulators seeded with input[0] in every lane.
BB30_5:
	WORD $0x9240104b // and x11, x2, #0x1f
	WORD $0xcb0b010a // sub x10, x8, x11
	WORD $0x4e010d20 // dup.16b v0, w9
	WORD $0x91004009 // add x9, x0, #16
	WORD $0xaa0a03ec // mov x12, x10
	WORD $0x4ea01c01 // mov.16b v1, v0
// 32-byte loop, reduction into w9, then dispatch on the remaining count.
BB30_6:
	WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16]
	WORD $0x4e206440 // smax.16b v0, v2, v0
	WORD $0x4e216461 // smax.16b v1, v3, v1
	WORD $0x91008129 // add x9, x9, #32
	WORD $0xf100818c // subs x12, x12, #32
	WORD $0x54ffff61 // b.ne LBB30_6
	WORD $0x4e216400 // smax.16b v0, v0, v1
	WORD $0x4e30a800 // smaxv.16b b0, v0
	WORD $0x1e260009 // fmov w9, s0
	WORD $0xb400032b // cbz x11, LBB30_14
	WORD $0xf100217f // cmp x11, #8
	WORD $0x540001e3 // b.lo LBB30_12
// 8-byte tier setup: x11 = size & 7, x10 = size - x11, v0 = running maximum
// in every lane, x9 = negative element count stepped up to zero.
BB30_9:
	WORD $0xaa0a03ed // mov x13, x10
	WORD $0x9240084b // and x11, x2, #0x7
	WORD $0x8b0a000c // add x12, x0, x10
	WORD $0xcb0b010a // sub x10, x8, x11
	WORD $0x0e010d20 // dup.8b v0, w9
	WORD $0x8b0b01a9 // add x9, x13, x11
	WORD $0xcb080129 // sub x9, x9, x8
// 8-byte loop, then reduction into w9.
BB30_10:
	WORD $0xfc408581 // ldr d1, [x12], #8
	WORD $0x0e206420 // smax.8b v0, v1, v0
	WORD $0xb1002129 // adds x9, x9, #8
	WORD $0x54ffffa1 // b.ne LBB30_10
	WORD $0x0e30a800 // smaxv.8b b0, v0
	WORD $0x1e260009 // fmov w9, s0
	WORD $0xb400012b // cbz x11, LBB30_14
// Scalar path setup: start at element x10; x8 = remaining element count.
BB30_12:
	WORD $0x8b0a000b // add x11, x0, x10
	WORD $0xcb0a0108 // sub x8, x8, x10
// Scalar loop: signed compare of the new byte against the running maximum.
BB30_13:
	WORD $0x38c0156a // ldrsb w10, [x11], #1
	WORD $0x13001d29 // sxtb w9, w9
	WORD $0x6b09015f // cmp w10, w9
	WORD $0x1a89c149 // csel w9, w10, w9, gt
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB30_13
BB30_14:
	WORD $0x39000029 // strb w9, [x1]
	WORD $0xd65f03c0 // ret

// _int8_add(input1, input2, output, size):
// output[i] = input1[i] + input2[i] on 8-bit elements (wrapping).
// size < 1 (low 32 bits, signed compare) returns immediately. When
// size >= 8 and the unsigned differences output-input1 and output-input2
// are both >= 32 bytes, elements are processed 32 per iteration (if
// size >= 32) and then 8 per iteration; whatever is left, or the whole
// input when those conditions fail, goes through the byte loop at BB31_3.
TEXT ·_int8_add(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x540001eb // b.lt LBB31_5
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x540001a2 // b.hs LBB31_6
	WORD $0xd2800009 // mov x9, #0
// Scalar path setup: start at element x9; x8 = remaining element count.
BB31_3:
	WORD $0x8b09004a // add x10, x2, x9
	WORD $0x8b09002b // add x11, x1, x9
	WORD $0x8b09000c // add x12, x0, x9
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one byte per iteration.
BB31_4:
	WORD $0x38401589 // ldrb w9, [x12], #1
	WORD $0x3840156d // ldrb w13, [x11], #1
	WORD $0x0b0901a9 // add w9, w13, w9
	WORD $0x38001549 // strb w9, [x10], #1
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB31_4
BB31_5:
	WORD $0xd65f03c0 // ret
// size >= 8: overlap checks, then choose the 32-byte or the 8-byte tier.
BB31_6:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54fffe43 // b.lo LBB31_3
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54fffde3 // b.lo LBB31_3
	WORD $0xf100811f // cmp x8, #32
	WORD $0x54000062 // b.hs LBB31_10
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000014 // b LBB31_14
// 32-byte tier setup: x10 = size & 31 (tail), x9 = x14 = count.
BB31_10:
	WORD $0x9240106a // and x10, x3, #0x1f
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// 32-byte loop, then dispatch on the remaining count.
BB31_11:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x4e208440 // add.16b v0, v2, v0
	WORD $0x4e218461 // add.16b v1, v3, v1
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10081ce // subs x14, x14, #32
	WORD $0x54fffee1 // b.ne LBB31_11
	WORD $0xb4fffc8a // cbz x10, LBB31_5
	WORD $0xf100215f // cmp x10, #8
	WORD $0x54fffb03 // b.lo LBB31_3
// 8-byte tier setup: x10 = size & 7, x9 = size - x10, x14 = negative element
// count that the loop steps up to zero.
BB31_14:
	WORD $0xaa0903ee // mov x14, x9
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0x8b09000b // add x11, x0, x9
	WORD $0x8b09002c // add x12, x1, x9
	WORD $0x8b09004d // add x13, x2, x9
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x8b0a01ce // add x14, x14, x10
	WORD $0xcb0801ce // sub x14, x14, x8
// 8-byte loop; any remainder falls back to the scalar path.
BB31_15:
	WORD $0xfc408560 // ldr d0, [x11], #8
	WORD $0xfc408581 // ldr d1, [x12], #8
	WORD $0x0e208420 // add.8b v0, v1, v0
	WORD $0xfc0085a0 // str d0, [x13], #8
	WORD $0xb10021ce // adds x14, x14, #8
	WORD $0x54ffff61 // b.ne LBB31_15
	WORD $0xb5fff92a // cbnz x10, LBB31_3
	WORD $0x17ffffd2 // b LBB31_5

// _int8_sub(input1, input2, output, size):
// output[i] = input1[i] - input2[i] on 8-bit elements (wrapping).
// Control flow is identical to _int8_add: 32-byte tier, 8-byte tier and
// byte loop, with the vector tiers guarded by size >= 8 and 32-byte
// output/input separation.
TEXT ·_int8_sub(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x540001eb // b.lt LBB32_5
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x540001a2 // b.hs LBB32_6
	WORD $0xd2800009 // mov x9, #0
// Scalar path setup: start at element x9; x8 = remaining element count.
BB32_3:
	WORD $0x8b09004a // add x10, x2, x9
	WORD $0x8b09002b // add x11, x1, x9
	WORD $0x8b09000c // add x12, x0, x9
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: w9 = input1[i], w13 = input2[i].
BB32_4:
	WORD $0x38401589 // ldrb w9, [x12], #1
	WORD $0x3840156d // ldrb w13, [x11], #1
	WORD $0x4b0d0129 // sub w9, w9, w13
	WORD $0x38001549 // strb w9, [x10], #1
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB32_4
BB32_5:
	WORD $0xd65f03c0 // ret
// size >= 8: overlap checks, then choose the 32-byte or the 8-byte tier.
BB32_6:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54fffe43 // b.lo LBB32_3
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54fffde3 // b.lo LBB32_3
	WORD $0xf100811f // cmp x8, #32
	WORD $0x54000062 // b.hs LBB32_10
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000014 // b LBB32_14
// 32-byte tier setup: x10 = size & 31 (tail), x9 = x14 = count.
BB32_10:
	WORD $0x9240106a // and x10, x3, #0x1f
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// 32-byte loop, then dispatch on the remaining count.
BB32_11:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x6e228400 // sub.16b v0, v0, v2
	WORD $0x6e238421 // sub.16b v1, v1, v3
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10081ce // subs x14, x14, #32
	WORD $0x54fffee1 // b.ne LBB32_11
	WORD $0xb4fffc8a // cbz x10, LBB32_5
	WORD $0xf100215f // cmp x10, #8
	WORD $0x54fffb03 // b.lo LBB32_3
// 8-byte tier setup: x10 = size & 7, x9 = size - x10, x14 = negative element
// count that the loop steps up to zero.
BB32_14:
	WORD $0xaa0903ee // mov x14, x9
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0x8b09000b // add x11, x0, x9
	WORD $0x8b09002c // add x12, x1, x9
	WORD $0x8b09004d // add x13, x2, x9
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x8b0a01ce // add x14, x14, x10
	WORD $0xcb0801ce // sub x14, x14, x8
// 8-byte loop; any remainder falls back to the scalar path.
BB32_15:
	WORD $0xfc408560 // ldr d0, [x11], #8
	WORD $0xfc408581 // ldr d1, [x12], #8
	WORD $0x2e218400 // sub.8b v0, v0, v1
	WORD $0xfc0085a0 // str d0, [x13], #8
	WORD $0xb10021ce // adds x14, x14, #8
	WORD $0x54ffff61 // b.ne LBB32_15
	WORD $0xb5fff92a // cbnz x10, LBB32_3
	WORD $0x17ffffd2 // b LBB32_5

// _int8_mul(input1, input2, output, size):
// output[i] = input1[i] * input2[i] on 8-bit elements (low 8 bits kept).
// Control flow is identical to _int8_add: 32-byte tier, 8-byte tier and
// byte loop, with the vector tiers guarded by size >= 8 and 32-byte
// output/input separation.
TEXT ·_int8_mul(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x540001eb // b.lt LBB33_5
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x540001a2 // b.hs LBB33_6
	WORD $0xd2800009 // mov x9, #0
// Scalar path setup: start at element x9; x8 = remaining element count.
BB33_3:
	WORD $0x8b09004a // add x10, x2, x9
	WORD $0x8b09002b // add x11, x1, x9
	WORD $0x8b09000c // add x12, x0, x9
	WORD $0xcb090108 // sub x8, x8, x9
// Scalar loop: one byte per iteration.
BB33_4:
	WORD $0x38401589 // ldrb w9, [x12], #1
	WORD $0x3840156d // ldrb w13, [x11], #1
	WORD $0x1b097da9 // mul w9, w13, w9
	WORD $0x38001549 // strb w9, [x10], #1
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x54ffff61 // b.ne LBB33_4
BB33_5:
	WORD $0xd65f03c0 // ret
// size >= 8: overlap checks, then choose the 32-byte or the 8-byte tier.
BB33_6:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54fffe43 // b.lo LBB33_3
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54fffde3 // b.lo LBB33_3
	WORD $0xf100811f // cmp x8, #32
	WORD $0x54000062 // b.hs LBB33_10
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000014 // b LBB33_14
// 32-byte tier setup: x10 = size & 31 (tail), x9 = x14 = count.
BB33_10:
	WORD $0x9240106a // and x10, x3, #0x1f
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// 32-byte loop, then dispatch on the remaining count.
BB33_11:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x4e209c40 // mul.16b v0, v2, v0
	WORD $0x4e219c61 // mul.16b v1, v3, v1
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x910081ad // add x13, x13, #32
	WORD $0xf10081ce // subs x14, x14, #32
	WORD $0x54fffee1 // b.ne LBB33_11
	WORD $0xb4fffc8a // cbz x10, LBB33_5
	WORD $0xf100215f // cmp x10, #8
	WORD $0x54fffb03 // b.lo LBB33_3
// 8-byte tier setup: x10 = size & 7, x9 = size - x10, x14 = negative element
// count that the loop steps up to zero.
BB33_14:
	WORD $0xaa0903ee // mov x14, x9
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0x8b09000b // add x11, x0, x9
	WORD $0x8b09002c // add x12, x1, x9
	WORD $0x8b09004d // add x13, x2, x9
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x8b0a01ce // add x14, x14, x10
	WORD $0xcb0801ce // sub x14, x14, x8
// 8-byte loop; any remainder falls back to the scalar path.
BB33_15:
	WORD $0xfc408560 // ldr d0, [x11], #8
	WORD $0xfc408581 // ldr d1, [x12], #8
	WORD $0x0e209c20 // mul.8b v0, v1, v0
	WORD $0xfc0085a0 // str d0, [x13], #8
	WORD $0xb10021ce // adds x14, x14, #8
	WORD $0x54ffff61 // b.ne LBB33_15
	WORD $0xb5fff92a // cbnz x10, LBB33_3
	WORD $0x17ffffd2 // b LBB33_5

// _int8_div(input1, input2, output, size): this routine continues past the
// part of the file reviewed here; only its visible prefix is annotated.
// The visible part saves x19-x24 with stp, guards size like the other
// kernels, and starts a 16-element loop that sign-extends bytes to 16 bits
// and divides lanes with scalar sdiv.
// NOTE(review): the first stp pre-decrements sp by 48 bytes although the
// TEXT directive declares a frame size of 0 - confirm this is safe under the
// Go runtime's stack rules and that sp is restored on every return path.
TEXT ·_int8_div(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0xa9bd5ff8 // stp x24, x23, [sp, #-48]! ; 16-byte Folded Spill
	WORD $0xa90157f6 // stp x22, x21, [sp, #16] ; 16-byte Folded Spill
	WORD $0xa9024ff4 // stp x20, x19, [sp, #32] ; 16-byte Folded Spill
	WORD $0x7100047f // cmp w3, #1
	WORD $0x54000d0b // b.lt LBB34_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100411f // cmp x8, #16
	WORD $0x54000062 // b.hs LBB34_3
	WORD $0xd2800009 // mov x9, #0
	WORD $0x14000059 // b LBB34_8
// Overlap checks (16-byte separation), then setup: x10 = size & 15 (tail).
BB34_3:
	WORD $0xd2800009 // mov x9, #0
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100415f // cmp x10, #16
	WORD $0x54000aa3 // b.lo LBB34_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100415f // cmp x10, #16
	WORD $0x54000a43 // b.lo LBB34_8
	WORD $0x92400c6a // and x10, x3, #0xf
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0xaa0903eb // mov x11, x9
	WORD $0xaa0203ec // mov x12, x2
	WORD $0xaa0103ed // mov x13, x1
	WORD $0xaa0003ee // mov x14, x0
// 16-element loop: dividends widened into v0/v1, divisors into v2/v3.
BB34_6:
	WORD $0x3cc105c1 // ldr q1, [x14], #16
	WORD $0x3cc105a2 // ldr q2, [x13], #16
	WORD $0x0f08a420 // sshll.8h v0, v1, #0
	WORD $0x4f08a421 // sshll2.8h v1, v1, #0
	WORD $0x4f08a443 // sshll2.8h v3, v2, #0
	WORD $0x0e062c6f // smov.h w15, v3[1]
	WORD $0x0e062c30 // smov.h w16, v1[1]
	WORD $0x0e022c71 // smov.h w17, v3[0]
	WORD $0x0e022c23 // smov.h w3, v1[0]
	WORD $0x0e0a2c64 // smov.h w4, v3[2]
	WORD $0x0e0a2c25 // smov.h w5, v1[2]
	WORD $0x0e0e2c66 // smov.h w6, v3[3]
	WORD $0x0e0e2c27 // smov.h w7, v1[3]
	WORD $0x0e122c73 // smov.h w19, v3[4]
	WORD $0x0e122c34 // smov.h w20, v1[4]
	WORD $0x0f08a442 // sshll.8h v2, v2, #0
	WORD $0x0e162c75 // smov.h w21, v3[5]
	WORD $0x0e162c36 // smov.h w22, v1[5]
	WORD $0x1acf0e0f // sdiv w15, w16, w15
	WORD $0x1ad10c70 // sdiv w16, w3, w17
	WORD $0x0e1a2c71 // smov.h w17, v3[6]
	WORD $0x0e1a2c23 // smov.h w3, v1[6]
	WORD $0x1ac40ca4 // sdiv w4, w5, w4
	WORD $0x0e062c45 // smov.h w5, v2[1]
	WORD $0x0e062c17 // smov.h w23, v0[1]
	WORD $0x1ac50ee5 // sdiv w5, w23, w5
	WORD $0x1e270204 // fmov s4, w16
	WORD $0x0e022c50 // smov.h w16, v2[0]
	WORD $0x0e022c17 // smov.h w23, v0[0]
	WORD $0x4e061de4 // mov.h v4[1], w15
	WORD $0x1ad00eef
// sdiv w15, w23, w16 WORD $0x0e1e2c70 // smov.h w16, v3[7] WORD $0x1e2701e3 // fmov s3, w15 WORD $0x4e061ca3 // mov.h v3[1], w5 WORD $0x0e0a2c4f // smov.h w15, v2[2] WORD $0x0e0a2c05 // smov.h w5, v0[2] WORD $0x4e0a1c84 // mov.h v4[2], w4 WORD $0x1ac60ce4 // sdiv w4, w7, w6 WORD $0x1acf0caf // sdiv w15, w5, w15 WORD $0x0e0e2c45 // smov.h w5, v2[3] WORD $0x4e0a1de3 // mov.h v3[2], w15 WORD $0x1ad30e8f // sdiv w15, w20, w19 WORD $0x0e0e2c06 // smov.h w6, v0[3] WORD $0x1ac50cc5 // sdiv w5, w6, w5 WORD $0x4e0e1c84 // mov.h v4[3], w4 WORD $0x0e122c44 // smov.h w4, v2[4] WORD $0x0e122c06 // smov.h w6, v0[4] WORD $0x4e0e1ca3 // mov.h v3[3], w5 WORD $0x1ad50ec5 // sdiv w5, w22, w21 WORD $0x1ac40cc4 // sdiv w4, w6, w4 WORD $0x0e1e2c26 // smov.h w6, v1[7] WORD $0x4e121de4 // mov.h v4[4], w15 WORD $0x4e121c83 // mov.h v3[4], w4 WORD $0x0e162c4f // smov.h w15, v2[5] WORD $0x4e161ca4 // mov.h v4[5], w5 WORD $0x1ad10c71 // sdiv w17, w3, w17 WORD $0x0e162c03 // smov.h w3, v0[5] WORD $0x1acf0c6f // sdiv w15, w3, w15 WORD $0x4e161de3 // mov.h v3[5], w15 WORD $0x0e1a2c4f // smov.h w15, v2[6] WORD $0x0e1a2c03 // smov.h w3, v0[6] WORD $0x4e1a1e24 // mov.h v4[6], w17 WORD $0x1acf0c6f // sdiv w15, w3, w15 WORD $0x0e1e2c51 // smov.h w17, v2[7] WORD $0x4e1a1de3 // mov.h v3[6], w15 WORD $0x1ad00ccf // sdiv w15, w6, w16 WORD $0x0e1e2c10 // smov.h w16, v0[7] WORD $0x1ad10e10 // sdiv w16, w16, w17 WORD $0x4e1e1de4 // mov.h v4[7], w15 WORD $0x4e1e1e03 // mov.h v3[7], w16 WORD $0x4e041860 // uzp1.16b v0, v3, v4 WORD $0x3c810580 // str q0, [x12], #16 WORD $0xf100416b // subs x11, x11, #16 WORD $0x54fff6e1 // b.ne LBB34_6 WORD $0xb400016a // cbz x10, LBB34_10 BB34_8: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 BB34_9: WORD $0x38c01589 // ldrsb w9, [x12], #1 WORD $0x38c0156d // ldrsb w13, [x11], #1 WORD $0x1acd0d29 // sdiv w9, w9, w13 WORD $0x38001549 // strb w9, [x10], #1 WORD $0xf1000508 // subs 
x8, x8, #1 WORD $0x54ffff61 // b.ne LBB34_9 BB34_10: WORD $0xa9424ff4 // ldp x20, x19, [sp, #32] ; 16-byte Folded Reload WORD $0xa94157f6 // ldp x22, x21, [sp, #16] ; 16-byte Folded Reload WORD $0xa8c35ff8 // ldp x24, x23, [sp], #48 ; 16-byte Folded Reload WORD $0xd65f03c0 // ret TEXT ·_int16_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt LBB35_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x540000c2 // b.hs LBB35_4 WORD $0xd2800009 // mov x9, #0 WORD $0x5280000a // mov w10, #0 WORD $0x14000013 // b LBB35_7 BB35_3: WORD $0x7900003f // strh wzr, [x1] WORD $0xd65f03c0 // ret BB35_4: WORD $0x92400c4b // and x11, x2, #0xf WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x9100400a // add x10, x0, #16 WORD $0x6f00e400 // movi.2d v0, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi.2d v1, #0000000000000000 BB35_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x4e608440 // add.8h v0, v2, v0 WORD $0x4e618461 // add.8h v1, v3, v1 WORD $0x9100814a // add x10, x10, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x54ffff61 // b.ne LBB35_5 WORD $0x4e608420 // add.8h v0, v1, v0 WORD $0x4e71b800 // addv.8h h0, v0 WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, LBB35_9 BB35_7: WORD $0x8b09040b // add x11, x0, x9, lsl #1 WORD $0xcb090108 // sub x8, x8, x9 BB35_8: WORD $0x78402569 // ldrh w9, [x11], #2 WORD $0x0b0a012a // add w10, w9, w10 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB35_8 BB35_9: WORD $0x7900002a // strh w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_int16_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt LBB36_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs LBB36_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000011 // b 
LBB36_6 BB36_3: WORD $0x92400c4b // and x11, x2, #0xf WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x4e020d00 // dup.8h v0, w8 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB36_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x4e606c40 // smin.8h v0, v2, v0 WORD $0x4e616c61 // smin.8h v1, v3, v1 WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x54ffff61 // b.ne LBB36_4 WORD $0x4e616c00 // smin.8h v0, v0, v1 WORD $0x4e71a800 // sminv.8h h0, v0 WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, LBB36_8 BB36_6: WORD $0x8b0a040b // add x11, x0, x10, lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 BB36_7: WORD $0x78c0256a // ldrsh w10, [x11], #2 WORD $0x13003d08 // sxth w8, w8 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88b148 // csel w8, w10, w8, lt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne LBB36_7 BB36_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int16_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt LBB37_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs LBB37_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000011 // b LBB37_6 BB37_3: WORD $0x92400c4b // and x11, x2, #0xf WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x4e020d00 // dup.8h v0, w8 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB37_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x4e606440 // smax.8h v0, v2, v0 WORD $0x4e616461 // smax.8h v1, v3, v1 WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x54ffff61 // b.ne LBB37_4 WORD $0x4e616400 // smax.8h v0, v0, v1 WORD $0x4e70a800 // smaxv.8h h0, v0 WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, LBB37_8 BB37_6: WORD $0x8b0a040b // add x11, x0, x10, 
lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 BB37_7: WORD $0x78c0256a // ldrsh w10, [x11], #2 WORD $0x13003d08 // sxth w8, w8 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88c148 // csel w8, w10, w8, gt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne LBB37_7 BB37_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int16_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB38_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs LBB38_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB38_8 BB38_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB38_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB38_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB38_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4e608440 // add.8h v0, v2, v0 WORD $0x4e618461 // add.8h v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0x54fffee1 // b.ne LBB38_6 WORD $0xb400018a // cbz x10, LBB38_10 BB38_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB38_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0x78002549 // strh w9, [x10], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne 
LBB38_9 BB38_10: WORD $0xd65f03c0 // ret TEXT ·_int16_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB39_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs LBB39_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB39_8 BB39_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB39_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB39_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB39_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6e628400 // sub.8h v0, v0, v2 WORD $0x6e638421 // sub.8h v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0x54fffee1 // b.ne LBB39_6 WORD $0xb400018a // cbz x10, LBB39_10 BB39_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB39_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0x78002549 // strh w9, [x10], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB39_9 BB39_10: WORD $0xd65f03c0 // ret TEXT ·_int16_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB40_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, 
#16 WORD $0x54000062 // b.hs LBB40_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB40_8 BB40_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB40_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB40_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB40_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4e609c40 // mul.8h v0, v2, v0 WORD $0x4e619c61 // mul.8h v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0x54fffee1 // b.ne LBB40_6 WORD $0xb400018a // cbz x10, LBB40_10 BB40_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB40_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0x78002549 // strh w9, [x10], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB40_9 BB40_10: WORD $0xd65f03c0 // ret TEXT ·_int16_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0xa9bf4ff4 // stp x20, x19, [sp, #-16]! 
; 16-byte Folded Spill WORD $0x7100047f // cmp w3, #1 WORD $0x5400092b // b.lt LBB41_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB41_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000039 // b LBB41_8 BB41_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x540006a3 // b.lo LBB41_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x54000643 // b.lo LBB41_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0903eb // mov x11, x9 WORD $0xaa0203ec // mov x12, x2 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0003ee // mov x14, x0 BB41_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x0f10a401 // sshll.4s v1, v0, #0 WORD $0x4f10a400 // sshll2.4s v0, v0, #0 WORD $0x3cc105a2 // ldr q2, [x13], #16 WORD $0x0f10a443 // sshll.4s v3, v2, #0 WORD $0x4f10a442 // sshll2.4s v2, v2, #0 WORD $0x0e0c3c4f // mov.s w15, v2[1] WORD $0x0e0c3c10 // mov.s w16, v0[1] WORD $0x0e143c51 // mov.s w17, v2[2] WORD $0x0e143c03 // mov.s w3, v0[2] WORD $0x0e1c3c44 // mov.s w4, v2[3] WORD $0x0e1c3c05 // mov.s w5, v0[3] WORD $0x1e260046 // fmov w6, s2 WORD $0x1e260007 // fmov w7, s0 WORD $0x1ac60ce6 // sdiv w6, w7, w6 WORD $0x0e0c3c67 // mov.s w7, v3[1] WORD $0x1acf0e0f // sdiv w15, w16, w15 WORD $0x1ad10c70 // sdiv w16, w3, w17 WORD $0x0e0c3c31 // mov.s w17, v1[1] WORD $0x0e143c63 // mov.s w3, v3[2] WORD $0x1ac40ca4 // sdiv w4, w5, w4 WORD $0x1ac70e31 // sdiv w17, w17, w7 WORD $0x0e143c25 // mov.s w5, v1[2] WORD $0x0e1c3c67 // mov.s w7, v3[3] WORD $0x1e2700c0 // fmov s0, w6 WORD $0x1e260066 // fmov w6, s3 WORD $0x0e1c3c33 // mov.s w19, v1[3] WORD $0x1e260034 // fmov w20, s1 WORD $0x4e0c1de0 // mov.s v0[1], w15 WORD $0x1ac60e8f // sdiv w15, w20, w6 WORD $0x1e2701e1 // fmov s1, w15 WORD $0x4e0c1e21 // mov.s v1[1], w17 WORD $0x4e141e00 // mov.s v0[2], w16 WORD $0x1ac30caf // sdiv w15, w5, w3 WORD $0x4e141de1 // mov.s v1[2], w15 WORD 
$0x1ac70e6f // sdiv w15, w19, w7 WORD $0x4e1c1c80 // mov.s v0[3], w4 WORD $0x4e1c1de1 // mov.s v1[3], w15 WORD $0x4e401820 // uzp1.8h v0, v1, v0 WORD $0x3c810580 // str q0, [x12], #16 WORD $0xf100216b // subs x11, x11, #8 WORD $0x54fffae1 // b.ne LBB41_6 WORD $0xb400018a // cbz x10, LBB41_10 BB41_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB41_9: WORD $0x78c02589 // ldrsh w9, [x12], #2 WORD $0x78c0256d // ldrsh w13, [x11], #2 WORD $0x1acd0d29 // sdiv w9, w9, w13 WORD $0x78002549 // strh w9, [x10], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB41_9 BB41_10: WORD $0xa8c14ff4 // ldp x20, x19, [sp], #16 ; 16-byte Folded Reload WORD $0xd65f03c0 // ret TEXT ·_int32_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt LBB42_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540000c2 // b.hs LBB42_4 WORD $0xd2800009 // mov x9, #0 WORD $0x5280000a // mov w10, #0 WORD $0x14000013 // b LBB42_7 BB42_3: WORD $0xb900003f // str wzr, [x1] WORD $0xd65f03c0 // ret BB42_4: WORD $0x9240084b // and x11, x2, #0x7 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x9100400a // add x10, x0, #16 WORD $0x6f00e400 // movi.2d v0, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi.2d v1, #0000000000000000 BB42_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x4ea08440 // add.4s v0, v2, v0 WORD $0x4ea18461 // add.4s v1, v3, v1 WORD $0x9100814a // add x10, x10, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x54ffff61 // b.ne LBB42_5 WORD $0x4ea08420 // add.4s v0, v1, v0 WORD $0x4eb1b800 // addv.4s s0, v0 WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, LBB42_9 BB42_7: WORD $0x8b09080b // add x11, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 BB42_8: WORD $0xb8404569 // ldr w9, 
[x11], #4 WORD $0x0b0a012a // add w10, w9, w10 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB42_8 BB42_9: WORD $0xb900002a // str w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_int32_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xb9400008 // ldr w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003ab // b.lt LBB43_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100213f // cmp x9, #8 WORD $0x54000062 // b.hs LBB43_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000011 // b LBB43_6 BB43_3: WORD $0x9240084b // and x11, x2, #0x7 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x4e040d00 // dup.4s v0, w8 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB43_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x4ea06c40 // smin.4s v0, v2, v0 WORD $0x4ea16c61 // smin.4s v1, v3, v1 WORD $0x91008108 // add x8, x8, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x54ffff61 // b.ne LBB43_4 WORD $0x4ea16c00 // smin.4s v0, v0, v1 WORD $0x4eb1a800 // sminv.4s s0, v0 WORD $0x1e260008 // fmov w8, s0 WORD $0xb400010b // cbz x11, LBB43_8 BB43_6: WORD $0x8b0a080b // add x11, x0, x10, lsl #2 WORD $0xcb0a0129 // sub x9, x9, x10 BB43_7: WORD $0xb840456a // ldr w10, [x11], #4 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88b148 // csel w8, w10, w8, lt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff81 // b.ne LBB43_7 BB43_8: WORD $0xb9000028 // str w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int32_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xb9400008 // ldr w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003ab // b.lt LBB44_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100213f // cmp x9, #8 WORD $0x54000062 // b.hs LBB44_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000011 // b LBB44_6 BB44_3: WORD $0x9240084b // and x11, x2, #0x7 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x4e040d00 // dup.4s v0, w8 WORD $0x91004008 // add x8, x0, #16 
WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB44_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x4ea06440 // smax.4s v0, v2, v0 WORD $0x4ea16461 // smax.4s v1, v3, v1 WORD $0x91008108 // add x8, x8, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x54ffff61 // b.ne LBB44_4 WORD $0x4ea16400 // smax.4s v0, v0, v1 WORD $0x4eb0a800 // smaxv.4s s0, v0 WORD $0x1e260008 // fmov w8, s0 WORD $0xb400010b // cbz x11, LBB44_8 BB44_6: WORD $0x8b0a080b // add x11, x0, x10, lsl #2 WORD $0xcb0a0129 // sub x9, x9, x10 BB44_7: WORD $0xb840456a // ldr w10, [x11], #4 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88c148 // csel w8, w10, w8, gt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff81 // b.ne LBB44_7 BB44_8: WORD $0xb9000028 // str w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int32_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB45_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB45_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB45_8 BB45_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB45_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB45_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB45_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4ea08440 // add.4s v0, v2, v0 WORD $0x4ea18461 // add.4s v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x54fffee1 // b.ne 
LBB45_6 WORD $0xb400018a // cbz x10, LBB45_10 BB45_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB45_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0xb8004549 // str w9, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB45_9 BB45_10: WORD $0xd65f03c0 // ret TEXT ·_int32_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB46_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB46_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB46_8 BB46_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB46_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB46_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB46_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6ea28400 // sub.4s v0, v0, v2 WORD $0x6ea38421 // sub.4s v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x54fffee1 // b.ne LBB46_6 WORD $0xb400018a // cbz x10, LBB46_10 BB46_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB46_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d 
// ldr w13, [x11], #4 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0xb8004549 // str w9, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB46_9 BB46_10: WORD $0xd65f03c0 // ret TEXT ·_int32_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB47_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB47_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB47_8 BB47_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB47_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB47_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB47_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4ea09c40 // mul.4s v0, v2, v0 WORD $0x4ea19c61 // mul.4s v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x54fffee1 // b.ne LBB47_6 WORD $0xb400018a // cbz x10, LBB47_10 BB47_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB47_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0xb8004549 // str w9, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB47_9 BB47_10: WORD $0xd65f03c0 // ret TEXT ·_int32_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD 
size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400068b // b.lt LBB48_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB48_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000024 // b LBB48_8 BB48_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000403 // b.lo LBB48_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x540003a3 // b.lo LBB48_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0903eb // mov x11, x9 WORD $0xaa0203ec // mov x12, x2 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0003ee // mov x14, x0 BB48_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0x0e0c3c2f // mov.s w15, v1[1] WORD $0x0e0c3c10 // mov.s w16, v0[1] WORD $0x0e143c31 // mov.s w17, v1[2] WORD $0x0e143c03 // mov.s w3, v0[2] WORD $0x0e1c3c24 // mov.s w4, v1[3] WORD $0x1e260025 // fmov w5, s1 WORD $0x0e1c3c06 // mov.s w6, v0[3] WORD $0x1e260007 // fmov w7, s0 WORD $0x1ac50ce5 // sdiv w5, w7, w5 WORD $0x1acf0e0f // sdiv w15, w16, w15 WORD $0x1e2700a0 // fmov s0, w5 WORD $0x4e0c1de0 // mov.s v0[1], w15 WORD $0x1ad10c6f // sdiv w15, w3, w17 WORD $0x4e141de0 // mov.s v0[2], w15 WORD $0x1ac40ccf // sdiv w15, w6, w4 WORD $0x4e1c1de0 // mov.s v0[3], w15 WORD $0x3c810580 // str q0, [x12], #16 WORD $0xf100116b // subs x11, x11, #4 WORD $0x54fffd81 // b.ne LBB48_6 WORD $0xb400018a // cbz x10, LBB48_10 BB48_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB48_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0x1acd0d29 // sdiv w9, w9, w13 WORD $0xb8004549 // str w9, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB48_9 BB48_10: WORD $0xd65f03c0 // ret TEXT ·_int64_sum(SB), $0-32 MOVD 
input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt LBB49_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x540000c2 // b.hs LBB49_4 WORD $0xd2800009 // mov x9, #0 WORD $0xd280000a // mov x10, #0 WORD $0x14000013 // b LBB49_7 BB49_3: WORD $0xf900003f // str xzr, [x1] WORD $0xd65f03c0 // ret BB49_4: WORD $0x9240044b // and x11, x2, #0x3 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x9100400a // add x10, x0, #16 WORD $0x6f00e400 // movi.2d v0, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi.2d v1, #0000000000000000 BB49_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x4ee08440 // add.2d v0, v2, v0 WORD $0x4ee18461 // add.2d v1, v3, v1 WORD $0x9100814a // add x10, x10, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x54ffff61 // b.ne LBB49_5 WORD $0x4ee08420 // add.2d v0, v1, v0 WORD $0x5ef1b800 // addp.2d d0, v0 WORD $0x9e66000a // fmov x10, d0 WORD $0xb40000eb // cbz x11, LBB49_9 BB49_7: WORD $0x8b090c0b // add x11, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 BB49_8: WORD $0xf8408569 // ldr x9, [x11], #8 WORD $0x8b0a012a // add x10, x9, x10 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB49_8 BB49_9: WORD $0xf900002a // str x10, [x1] WORD $0xd65f03c0 // ret TEXT ·_int64_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xf9400009 // ldr x9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400044b // b.lt LBB50_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB50_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000016 // b LBB50_6 BB50_3: WORD $0x9240044b // and x11, x2, #0x3 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x4e080d20 // dup.2d v0, x9 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB50_4: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x4ee23404 // cmgt.2d v4, v0, v2 
WORD $0x6ea41c40 // bit.16b v0, v2, v4 WORD $0x4ee33422 // cmgt.2d v2, v1, v3 WORD $0x6ea21c61 // bit.16b v1, v3, v2 WORD $0x91008129 // add x9, x9, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x54ffff21 // b.ne LBB50_4 WORD $0x4ee03422 // cmgt.2d v2, v1, v0 WORD $0x6ee21c20 // bif.16b v0, v1, v2 WORD $0x6e004001 // ext.16b v1, v0, v0, #8 WORD $0x5ee03422 // cmgt d2, d1, d0 WORD $0x2ee21c20 // bif.8b v0, v1, v2 WORD $0x9e660009 // fmov x9, d0 WORD $0xb400010b // cbz x11, LBB50_8 BB50_6: WORD $0x8b0a0c0b // add x11, x0, x10, lsl #3 WORD $0xcb0a0108 // sub x8, x8, x10 BB50_7: WORD $0xf840856a // ldr x10, [x11], #8 WORD $0xeb09015f // cmp x10, x9 WORD $0x9a89b149 // csel x9, x10, x9, lt WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff81 // b.ne LBB50_7 BB50_8: WORD $0xf9000029 // str x9, [x1] WORD $0xd65f03c0 // ret TEXT ·_int64_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xf9400009 // ldr x9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400044b // b.lt LBB51_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB51_3 WORD $0xd280000a // mov x10, #0 WORD $0x14000016 // b LBB51_6 BB51_3: WORD $0x9240044b // and x11, x2, #0x3 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x4e080d20 // dup.2d v0, x9 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov.16b v1, v0 BB51_4: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x4ee03444 // cmgt.2d v4, v2, v0 WORD $0x6ea41c40 // bit.16b v0, v2, v4 WORD $0x4ee13462 // cmgt.2d v2, v3, v1 WORD $0x6ea21c61 // bit.16b v1, v3, v2 WORD $0x91008129 // add x9, x9, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x54ffff21 // b.ne LBB51_4 WORD $0x4ee13402 // cmgt.2d v2, v0, v1 WORD $0x6ee21c20 // bif.16b v0, v1, v2 WORD $0x6e004001 // ext.16b v1, v0, v0, #8 WORD $0x5ee13402 // cmgt d2, d0, d1 WORD $0x2ee21c20 // bif.8b v0, v1, v2 WORD $0x9e660009 // fmov x9, d0 WORD $0xb400010b // cbz x11, LBB51_8 BB51_6: 
WORD $0x8b0a0c0b // add x11, x0, x10, lsl #3 WORD $0xcb0a0108 // sub x8, x8, x10 BB51_7: WORD $0xf840856a // ldr x10, [x11], #8 WORD $0xeb09015f // cmp x10, x9 WORD $0x9a89c149 // csel x9, x10, x9, gt WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff81 // b.ne LBB51_7 BB51_8: WORD $0xf9000029 // str x9, [x1] WORD $0xd65f03c0 // ret TEXT ·_int64_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB52_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB52_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB52_8 BB52_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB52_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB52_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB52_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4ee08440 // add.2d v0, v2, v0 WORD $0x4ee18461 // add.2d v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x54fffee1 // b.ne LBB52_6 WORD $0xb400018a // cbz x10, LBB52_10 BB52_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB52_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0x8b0901a9 // add x9, x13, x9 WORD $0xf8008549 // str x9, [x10], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne 
LBB52_9 BB52_10: WORD $0xd65f03c0 // ret TEXT ·_int64_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB53_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB53_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB53_8 BB53_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB53_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB53_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB53_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6ee28400 // sub.2d v0, v0, v2 WORD $0x6ee38421 // sub.2d v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x54fffee1 // b.ne LBB53_6 WORD $0xb400018a // cbz x10, LBB53_10 BB53_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB53_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0xcb0d0129 // sub x9, x9, x13 WORD $0xf8008549 // str x9, [x10], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB53_9 BB53_10: WORD $0xd65f03c0 // ret TEXT ·_int64_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540006eb // b.lt LBB54_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 
WORD $0x54000062 // b.hs LBB54_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000027 // b LBB54_8 BB54_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54000463 // b.lo LBB54_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000403 // b.lo LBB54_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB54_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x3cdf0182 // ldur q2, [x12, #-16] WORD $0x4e183c0f // mov.d x15, v0[1] WORD $0x4e183c50 // mov.d x16, v2[1] WORD $0x9b0f7e0f // mul x15, x16, x15 WORD $0x9e660010 // fmov x16, d0 WORD $0x9e660051 // fmov x17, d2 WORD $0x9b107e30 // mul x16, x17, x16 WORD $0x9e670200 // fmov d0, x16 WORD $0x4e181de0 // mov.d v0[1], x15 WORD $0x3cc20582 // ldr q2, [x12], #32 WORD $0x4e183c2f // mov.d x15, v1[1] WORD $0x4e183c50 // mov.d x16, v2[1] WORD $0x9b0f7e0f // mul x15, x16, x15 WORD $0x9e660030 // fmov x16, d1 WORD $0x9e660051 // fmov x17, d2 WORD $0x9b107e30 // mul x16, x17, x16 WORD $0x9e670201 // fmov d1, x16 WORD $0x4e181de1 // mov.d v1[1], x15 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x54fffd21 // b.ne LBB54_6 WORD $0xb400018a // cbz x10, LBB54_10 BB54_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB54_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0x9b097da9 // mul x9, x13, x9 WORD $0xf8008549 // str x9, [x10], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB54_9 BB54_10: WORD $0xd65f03c0 // ret TEXT ·_int64_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD 
input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400058b // b.lt LBB55_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100091f // cmp x8, #2 WORD $0x54000062 // b.hs LBB55_3 WORD $0xd2800009 // mov x9, #0 WORD $0x1400001c // b LBB55_8 BB55_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000303 // b.lo LBB55_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x540002a3 // b.lo LBB55_8 WORD $0x9240006a // and x10, x3, #0x1 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0903eb // mov x11, x9 WORD $0xaa0203ec // mov x12, x2 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0003ee // mov x14, x0 BB55_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0x4e183c2f // mov.d x15, v1[1] WORD $0x4e183c10 // mov.d x16, v0[1] WORD $0x9acf0e0f // sdiv x15, x16, x15 WORD $0x9e660030 // fmov x16, d1 WORD $0x9e660011 // fmov x17, d0 WORD $0x9ad00e30 // sdiv x16, x17, x16 WORD $0x9e670200 // fmov d0, x16 WORD $0x4e181de0 // mov.d v0[1], x15 WORD $0x3c810580 // str q0, [x12], #16 WORD $0xf100096b // subs x11, x11, #2 WORD $0x54fffe81 // b.ne LBB55_6 WORD $0xb400018a // cbz x10, LBB55_10 BB55_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB55_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0x9acd0d29 // sdiv x9, x9, x13 WORD $0xf8008549 // str x9, [x10], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB55_9 BB55_10: WORD $0xd65f03c0 // ret TEXT ·_float32_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt LBB56_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540000e2 // b.hs LBB56_4 WORD $0xd2800009 // 
mov x9, #0 WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0x14000014 // b LBB56_7 BB56_3: WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret BB56_4: WORD $0x9240084a // and x10, x2, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x6f00e400 // movi.2d v0, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi.2d v1, #0000000000000000 BB56_5: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x4e20d440 // fadd.4s v0, v2, v0 WORD $0x4e21d461 // fadd.4s v1, v3, v1 WORD $0x9100816b // add x11, x11, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x54ffff61 // b.ne LBB56_5 WORD $0x4e20d420 // fadd.4s v0, v1, v0 WORD $0x6e20d400 // faddp.4s v0, v0, v0 WORD $0x7e30d800 // faddp.2s s0, v0 WORD $0xb40000ea // cbz x10, LBB56_9 BB56_7: WORD $0x8b09080a // add x10, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 BB56_8: WORD $0xbc404541 // ldr s1, [x10], #4 WORD $0x1e202820 // fadd s0, s1, s0 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB56_8 BB56_9: WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float32_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xbd400000 // ldr s0, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400036b // b.lt LBB57_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB57_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000010 // b LBB57_6 BB57_3: WORD $0x9240084a // and x10, x2, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x4e040400 // dup.4s v0, v0[0] WORD $0x9100400b // add x11, x0, #16 WORD $0xaa0903ec // mov x12, x9 WORD $0x4ea01c01 // mov.16b v1, v0 BB57_4: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x4ea0c440 // fminnm.4s v0, v2, v0 WORD $0x4ea1c461 // fminnm.4s v1, v3, v1 WORD $0x9100816b // add x11, x11, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x54ffff61 // b.ne LBB57_4 WORD $0x4ea1c400 // 
fminnm.4s v0, v0, v1 WORD $0x6eb0c800 // fminnmv.4s s0, v0 WORD $0xb40000ea // cbz x10, LBB57_8 BB57_6: WORD $0x8b09080a // add x10, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 BB57_7: WORD $0xbc404541 // ldr s1, [x10], #4 WORD $0x1e207820 // fminnm s0, s1, s0 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB57_7 BB57_8: WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float32_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xbd400000 // ldr s0, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400036b // b.lt LBB58_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB58_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000010 // b LBB58_6 BB58_3: WORD $0x9240084a // and x10, x2, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x4e040400 // dup.4s v0, v0[0] WORD $0x9100400b // add x11, x0, #16 WORD $0xaa0903ec // mov x12, x9 WORD $0x4ea01c01 // mov.16b v1, v0 BB58_4: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x4e20c440 // fmaxnm.4s v0, v2, v0 WORD $0x4e21c461 // fmaxnm.4s v1, v3, v1 WORD $0x9100816b // add x11, x11, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x54ffff61 // b.ne LBB58_4 WORD $0x4e21c400 // fmaxnm.4s v0, v0, v1 WORD $0x6e30c800 // fmaxnmv.4s s0, v0 WORD $0xb40000ea // cbz x10, LBB58_8 BB58_6: WORD $0x8b09080a // add x10, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 BB58_7: WORD $0xbc404541 // ldr s1, [x10], #4 WORD $0x1e206820 // fmaxnm s0, s1, s0 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB58_7 BB58_8: WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float32_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB59_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB59_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB59_8 
BB59_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB59_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB59_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB59_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4e20d440 // fadd.4s v0, v2, v0 WORD $0x4e21d461 // fadd.4s v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x54fffee1 // b.ne LBB59_6 WORD $0xb400018a // cbz x10, LBB59_10 BB59_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB59_9: WORD $0xbc404580 // ldr s0, [x12], #4 WORD $0xbc404561 // ldr s1, [x11], #4 WORD $0x1e202820 // fadd s0, s1, s0 WORD $0xbc004540 // str s0, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB59_9 BB59_10: WORD $0xd65f03c0 // ret TEXT ·_float32_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB60_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB60_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB60_8 BB60_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB60_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB60_8 WORD $0x9240086a // and x10, x3, #0x7 WORD 
$0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB60_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4ea2d400 // fsub.4s v0, v0, v2 WORD $0x4ea3d421 // fsub.4s v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x54fffee1 // b.ne LBB60_6 WORD $0xb400018a // cbz x10, LBB60_10 BB60_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB60_9: WORD $0xbc404580 // ldr s0, [x12], #4 WORD $0xbc404561 // ldr s1, [x11], #4 WORD $0x1e213800 // fsub s0, s0, s1 WORD $0xbc004540 // str s0, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB60_9 BB60_10: WORD $0xd65f03c0 // ret TEXT ·_float32_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB61_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB61_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB61_8 BB61_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB61_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB61_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB61_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6e20dc40 // 
fmul.4s v0, v2, v0 WORD $0x6e21dc61 // fmul.4s v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x54fffee1 // b.ne LBB61_6 WORD $0xb400018a // cbz x10, LBB61_10 BB61_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB61_9: WORD $0xbc404580 // ldr s0, [x12], #4 WORD $0xbc404561 // ldr s1, [x11], #4 WORD $0x1e200820 // fmul s0, s1, s0 WORD $0xbc004540 // str s0, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB61_9 BB61_10: WORD $0xd65f03c0 // ret TEXT ·_float32_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB62_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs LBB62_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB62_8 BB62_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB62_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB62_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB62_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6e22fc00 // fdiv.4s v0, v0, v2 WORD $0x6e23fc21 // fdiv.4s v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x54fffee1 // b.ne LBB62_6 WORD 
$0xb400018a // cbz x10, LBB62_10 BB62_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB62_9: WORD $0xbc404580 // ldr s0, [x12], #4 WORD $0xbc404561 // ldr s1, [x11], #4 WORD $0x1e211800 // fdiv s0, s0, s1 WORD $0xbc004540 // str s0, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB62_9 BB62_10: WORD $0xd65f03c0 // ret TEXT ·_float64_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt LBB63_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x540000e2 // b.hs LBB63_4 WORD $0xd2800009 // mov x9, #0 WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0x14000013 // b LBB63_7 BB63_3: WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0xfd000020 // str d0, [x1] WORD $0xd65f03c0 // ret BB63_4: WORD $0x9240044a // and x10, x2, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x6f00e400 // movi.2d v0, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi.2d v1, #0000000000000000 BB63_5: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x4e60d440 // fadd.2d v0, v2, v0 WORD $0x4e61d461 // fadd.2d v1, v3, v1 WORD $0x9100816b // add x11, x11, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x54ffff61 // b.ne LBB63_5 WORD $0x4e60d420 // fadd.2d v0, v1, v0 WORD $0x7e70d800 // faddp.2d d0, v0 WORD $0xb40000ea // cbz x10, LBB63_9 BB63_7: WORD $0x8b090c0a // add x10, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 BB63_8: WORD $0xfc408541 // ldr d1, [x10], #8 WORD $0x1e602820 // fadd d0, d1, d0 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB63_8 BB63_9: WORD $0xfd000020 // str d0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float64_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xfd400000 // ldr d0, 
[x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400036b // b.lt LBB64_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB64_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000010 // b LBB64_6 BB64_3: WORD $0x9240044a // and x10, x2, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x4e080400 // dup.2d v0, v0[0] WORD $0x9100400b // add x11, x0, #16 WORD $0xaa0903ec // mov x12, x9 WORD $0x4ea01c01 // mov.16b v1, v0 BB64_4: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x4ee0c440 // fminnm.2d v0, v2, v0 WORD $0x4ee1c461 // fminnm.2d v1, v3, v1 WORD $0x9100816b // add x11, x11, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x54ffff61 // b.ne LBB64_4 WORD $0x4ee1c400 // fminnm.2d v0, v0, v1 WORD $0x7ef0c800 // fminnmp.2d d0, v0 WORD $0xb40000ea // cbz x10, LBB64_8 BB64_6: WORD $0x8b090c0a // add x10, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 BB64_7: WORD $0xfc408541 // ldr d1, [x10], #8 WORD $0x1e607820 // fminnm d0, d1, d0 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB64_7 BB64_8: WORD $0xfd000020 // str d0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float64_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xfd400000 // ldr d0, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400036b // b.lt LBB65_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB65_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000010 // b LBB65_6 BB65_3: WORD $0x9240044a // and x10, x2, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x4e080400 // dup.2d v0, v0[0] WORD $0x9100400b // add x11, x0, #16 WORD $0xaa0903ec // mov x12, x9 WORD $0x4ea01c01 // mov.16b v1, v0 BB65_4: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x4e60c440 // fmaxnm.2d v0, v2, v0 WORD $0x4e61c461 // fmaxnm.2d v1, v3, v1 WORD $0x9100816b // add x11, x11, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x54ffff61 // b.ne LBB65_4 WORD $0x4e61c400 // fmaxnm.2d v0, v0, v1 WORD 
$0x7e70c800 // fmaxnmp.2d d0, v0 WORD $0xb40000ea // cbz x10, LBB65_8 BB65_6: WORD $0x8b090c0a // add x10, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 BB65_7: WORD $0xfc408541 // ldr d1, [x10], #8 WORD $0x1e606820 // fmaxnm d0, d1, d0 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffffa1 // b.ne LBB65_7 BB65_8: WORD $0xfd000020 // str d0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float64_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB66_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB66_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB66_8 BB66_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB66_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB66_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB66_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4e60d440 // fadd.2d v0, v2, v0 WORD $0x4e61d461 // fadd.2d v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x54fffee1 // b.ne LBB66_6 WORD $0xb400018a // cbz x10, LBB66_10 BB66_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB66_9: WORD $0xfc408580 // ldr d0, [x12], #8 WORD $0xfc408561 // ldr d1, [x11], #8 WORD $0x1e602820 // fadd d0, d1, d0 WORD $0xfc008540 // str d0, [x10], #8 WORD $0xf1000508 // 
subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB66_9 BB66_10: WORD $0xd65f03c0 // ret TEXT ·_float64_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB67_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB67_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB67_8 BB67_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB67_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB67_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB67_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x4ee2d400 // fsub.2d v0, v0, v2 WORD $0x4ee3d421 // fsub.2d v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x54fffee1 // b.ne LBB67_6 WORD $0xb400018a // cbz x10, LBB67_10 BB67_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB67_9: WORD $0xfc408580 // ldr d0, [x12], #8 WORD $0xfc408561 // ldr d1, [x11], #8 WORD $0x1e613800 // fsub d0, d0, d1 WORD $0xfc008540 // str d0, [x10], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB67_9 BB67_10: WORD $0xd65f03c0 // ret TEXT ·_float64_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB68_10 WORD $0x92407c68 // and x8, x3, 
#0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB68_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB68_8 BB68_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB68_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB68_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB68_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6e60dc40 // fmul.2d v0, v2, v0 WORD $0x6e61dc61 // fmul.2d v1, v3, v1 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x54fffee1 // b.ne LBB68_6 WORD $0xb400018a // cbz x10, LBB68_10 BB68_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB68_9: WORD $0xfc408580 // ldr d0, [x12], #8 WORD $0xfc408561 // ldr d1, [x11], #8 WORD $0x1e600820 // fmul d0, d1, d0 WORD $0xfc008540 // str d0, [x10], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB68_9 BB68_10: WORD $0xd65f03c0 // ret TEXT ·_float64_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt LBB69_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs LBB69_3 WORD $0xd2800009 // mov x9, #0 WORD $0x14000019 // b LBB69_8 BB69_3: WORD $0xd2800009 // mov x9, #0 WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo LBB69_8 WORD $0xcb01004a 
// sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo LBB69_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 BB69_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x6e62fc00 // fdiv.2d v0, v0, v2 WORD $0x6e63fc21 // fdiv.2d v1, v1, v3 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0x9100818c // add x12, x12, #32 WORD $0x910081ad // add x13, x13, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x54fffee1 // b.ne LBB69_6 WORD $0xb400018a // cbz x10, LBB69_10 BB69_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 WORD $0xcb090108 // sub x8, x8, x9 BB69_9: WORD $0xfc408580 // ldr d0, [x12], #8 WORD $0xfc408561 // ldr d1, [x11], #8 WORD $0x1e611800 // fdiv d0, d0, d1 WORD $0xfc008540 // str d0, [x10], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne LBB69_9 BB69_10: WORD $0xd65f03c0 // ret golang-github-kelindar-simd-1.2.0/simd_avx2_amd64.go000066400000000000000000000200611517522302000221740ustar00rootroot00000000000000//go:build !noasm && amd64 // AUTO-GENERATED BY GOCC -- DO NOT EDIT package simd import "unsafe" //go:nosplit //go:noescape func _uint8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit 
//go:noescape func _uint8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint16_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func 
_uint64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output 
unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) 
//go:nosplit //go:noescape func _float32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) golang-github-kelindar-simd-1.2.0/simd_avx2_amd64.s000066400000000000000000011605451517522302000220460ustar00rootroot00000000000000//go:build !noasm && amd64 // AUTO-GENERATED BY GOCC -- DO NOT EDIT TEXT ·_uint8_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB0_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx 
LONG $0x10f88349 // cmp r8, 16 JAE LBB0_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d WORD $0xc031 // xor eax, eax JMP LBB0_13 LBB0_1: WORD $0xc031 // xor eax, eax JMP LBB0_14 LBB0_4: LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB0_6 WORD $0xc031 // xor eax, eax WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB0_10 LBB0_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 LBB0_7: LONG $0x04fcfdc5; BYTE $0x07 // vpaddb ymm0, ymm0, ymmword ptr [rdi + rax] LONG $0x4cfcf5c5; WORD $0x2007 // vpaddb ymm1, ymm1, ymmword ptr [rdi + rax + 32] LONG $0x54fcedc5; WORD $0x4007 // vpaddb ymm2, ymm2, ymmword ptr [rdi + rax + 64] LONG $0x5cfce5c5; WORD $0x6007 // vpaddb ymm3, ymm3, ymmword ptr [rdi + rax + 96] LONG $0x80e88348 // sub rax, -128 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB0_7 LONG $0xc0fcf5c5 // vpaddb ymm0, ymm1, ymm0 LONG $0xc0fcedc5 // vpaddb ymm0, ymm2, ymm0 LONG $0xc0fce5c5 // vpaddb ymm0, ymm3, ymm0 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1fcf9c5 // vpaddb xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fcf9c5 // vpaddb xmm0, xmm0, xmm1 LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xc1f6f9c5 // vpsadbw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB0_14 LONG $0x10f98341 // cmp r9d, 16 JB LBB0_13 LBB0_10: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0xc06ef9c5 // vmovd xmm0, eax LBB0_11: LONG $0x04fcf9c5; BYTE $0x0f // vpaddb xmm0, xmm0, xmmword ptr [rdi + 
rcx] LONG $0x10c18348 // add rcx, 16 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB0_11 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fcf9c5 // vpaddb xmm0, xmm0, xmm1 LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xc1f6f9c5 // vpsadbw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB0_14 LBB0_13: LONG $0x17040242 // add al, byte ptr [rdi + r10] WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB0_13 LBB0_14: WORD $0x0688 // mov byte ptr [rsi], al WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint8_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xb60f; BYTE $0x07 // movzx eax, byte ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB1_13 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB1_3 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB1_12 LBB1_3: LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB1_5 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB1_9 LBB1_5: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB1_6: LONG $0x04dafdc5; BYTE $0x07 // vpminub ymm0, ymm0, ymmword ptr [rdi + rax] LONG $0x4cdaf5c5; WORD $0x2007 // vpminub ymm1, ymm1, ymmword ptr [rdi + rax + 32] LONG $0x54daedc5; WORD $0x4007 // vpminub ymm2, ymm2, ymmword ptr [rdi + rax + 64] LONG $0x5cdae5c5; WORD $0x6007 // vpminub ymm3, ymm3, ymmword ptr [rdi + rax + 96] LONG $0x80e88348 // sub rax, 
-128 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB1_6 LONG $0xc1dafdc5 // vpminub ymm0, ymm0, ymm1 LONG $0xc2dafdc5 // vpminub ymm0, ymm0, ymm2 LONG $0xc3dafdc5 // vpminub ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB1_13 LONG $0x10f98341 // cmp r9d, 16 JB LBB1_12 LBB1_9: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7879e2c4; BYTE $0xc0 // vpbroadcastb xmm0, xmm0 LBB1_10: LONG $0x04daf9c5; BYTE $0x0f // vpminub xmm0, xmm0, xmmword ptr [rdi + rcx] LONG $0x10c18348 // add rcx, 16 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB1_10 LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB1_13 LBB1_12: LONG $0x0cb60f42; BYTE $0x17 // movzx ecx, byte ptr [rdi + r10] WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xc138 // cmp cl, al WORD $0x420f; BYTE $0xc1 // cmovb eax, ecx WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB1_12 LBB1_13: WORD $0x0688 // mov byte ptr [rsi], al WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint8_max(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xb60f; BYTE $0x07 // movzx eax, byte ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB2_13 WORD $0x8941; BYTE 
$0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB2_3 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB2_12 LBB2_3: LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB2_5 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB2_9 LBB2_5: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB2_6: LONG $0x04defdc5; BYTE $0x07 // vpmaxub ymm0, ymm0, ymmword ptr [rdi + rax] LONG $0x4cdef5c5; WORD $0x2007 // vpmaxub ymm1, ymm1, ymmword ptr [rdi + rax + 32] LONG $0x54deedc5; WORD $0x4007 // vpmaxub ymm2, ymm2, ymmword ptr [rdi + rax + 64] LONG $0x5cdee5c5; WORD $0x6007 // vpmaxub ymm3, ymm3, ymmword ptr [rdi + rax + 96] LONG $0x80e88348 // sub rax, -128 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB2_6 LONG $0xc1defdc5 // vpmaxub ymm0, ymm0, ymm1 LONG $0xc2defdc5 // vpmaxub ymm0, ymm0, ymm2 LONG $0xc3defdc5 // vpmaxub ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1def9c5 // vpmaxub xmm0, xmm0, xmm1 LONG $0xc976f1c5 // vpcmpeqd xmm1, xmm1, xmm1 LONG $0xc1eff9c5 // vpxor xmm0, xmm0, xmm1 LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0xd0f6 // not al WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB2_13 LONG $0x10f98341 // cmp r9d, 16 JB LBB2_12 LBB2_9: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7879e2c4; BYTE $0xc0 // vpbroadcastb xmm0, xmm0 LBB2_10: LONG $0x04def9c5; BYTE 
$0x0f // vpmaxub xmm0, xmm0, xmmword ptr [rdi + rcx] LONG $0x10c18348 // add rcx, 16 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB2_10 LONG $0xc976f1c5 // vpcmpeqd xmm1, xmm1, xmm1 LONG $0xc1eff9c5 // vpxor xmm0, xmm0, xmm1 LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0xd0f6 // not al WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB2_13 LBB2_12: LONG $0x0cb60f42; BYTE $0x17 // movzx ecx, byte ptr [rdi + r10] WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xc138 // cmp cl, al WORD $0x470f; BYTE $0xc1 // cmova eax, ecx WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB2_12 LBB2_13: WORD $0x0688 // mov byte ptr [rsi], al WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint8_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB3_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB3_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB3_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB3_16 LBB3_15: LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte ptr [rsi + r11] LONG $0x1f040242 // add al, byte ptr [rdi + r11] LONG $0x1a048842 // mov byte ptr [rdx + r11], al WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB3_15 LBB3_16: LONG $0x03f98349 // cmp r9, 3 JB LBB3_18 LBB3_17: LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte ptr [rsi + r11] LONG $0x1f040242 // add al, byte ptr [rdi + r11] LONG $0x1a048842 // mov byte ptr [rdx + r11], al 
LONG $0x44b60f42; WORD $0x011e // movzx eax, byte ptr [rsi + r11 + 1] LONG $0x1f440242; BYTE $0x01 // add al, byte ptr [rdi + r11 + 1] LONG $0x1a448842; BYTE $0x01 // mov byte ptr [rdx + r11 + 1], al LONG $0x44b60f42; WORD $0x021e // movzx eax, byte ptr [rsi + r11 + 2] LONG $0x1f440242; BYTE $0x02 // add al, byte ptr [rdi + r11 + 2] LONG $0x1a448842; BYTE $0x02 // mov byte ptr [rdx + r11 + 2], al LONG $0x44b60f42; WORD $0x031e // movzx eax, byte ptr [rsi + r11 + 3] LONG $0x1f440242; BYTE $0x03 // add al, byte ptr [rdi + r11 + 3] LONG $0x1a448842; BYTE $0x03 // mov byte ptr [rdx + r11 + 3], al LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB3_17 LBB3_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB3_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB3_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB3_14 LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB3_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB3_11 LBB3_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB3_8: LONG $0x6f7ea1c4; WORD $0x1604 // vmovdqu ymm0, ymmword ptr [rsi + r10] LONG $0x6f7ea1c4; WORD $0x164c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + r10 + 32] LONG $0x6f7ea1c4; WORD $0x1654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + r10 + 64] LONG $0x6f7ea1c4; WORD $0x165c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + r10 + 96] LONG $0xfc7da1c4; WORD $0x1704 // vpaddb ymm0, ymm0, ymmword ptr [rdi + r10] LONG $0xfc75a1c4; WORD $0x174c; BYTE $0x20 // vpaddb ymm1, ymm1, ymmword ptr [rdi + r10 + 32] 
LONG $0xfc6da1c4; WORD $0x1754; BYTE $0x40 // vpaddb ymm2, ymm2, ymmword ptr [rdi + r10 + 64] LONG $0xfc65a1c4; WORD $0x175c; BYTE $0x60 // vpaddb ymm3, ymm3, ymmword ptr [rdi + r10 + 96] LONG $0x7f7ea1c4; WORD $0x1204 // vmovdqu ymmword ptr [rdx + r10], ymm0 LONG $0x7f7ea1c4; WORD $0x124c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x1254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x125c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + r10 + 96], ymm3 LONG $0x80ea8349 // sub r10, -128 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB3_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB3_18 LONG $0x10f98341 // cmp r9d, 16 JB LBB3_14 LBB3_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x0fe28341 // and r10d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB3_12: LONG $0x6f7aa1c4; WORD $0x0e04 // vmovdqu xmm0, xmmword ptr [rsi + r9] LONG $0xfc79a1c4; WORD $0x0f04 // vpaddb xmm0, xmm0, xmmword ptr [rdi + r9] LONG $0x7f7aa1c4; WORD $0x0a04 // vmovdqu xmmword ptr [rdx + r9], xmm0 LONG $0x10c18349 // add r9, 16 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB3_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB3_14 JMP LBB3_18 TEXT ·_uint8_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB4_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB4_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB4_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB4_16 LBB4_15: LONG $0x04b60f42; BYTE $0x1f // movzx eax, byte ptr [rdi + r11] LONG $0x1e042a42 // sub al, byte ptr [rsi + r11] 
LONG $0x1a048842 // mov byte ptr [rdx + r11], al WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB4_15 LBB4_16: LONG $0x03f98349 // cmp r9, 3 JB LBB4_18 LBB4_17: LONG $0x04b60f42; BYTE $0x1f // movzx eax, byte ptr [rdi + r11] LONG $0x1e042a42 // sub al, byte ptr [rsi + r11] LONG $0x1a048842 // mov byte ptr [rdx + r11], al LONG $0x44b60f42; WORD $0x011f // movzx eax, byte ptr [rdi + r11 + 1] LONG $0x1e442a42; BYTE $0x01 // sub al, byte ptr [rsi + r11 + 1] LONG $0x1a448842; BYTE $0x01 // mov byte ptr [rdx + r11 + 1], al LONG $0x44b60f42; WORD $0x021f // movzx eax, byte ptr [rdi + r11 + 2] LONG $0x1e442a42; BYTE $0x02 // sub al, byte ptr [rsi + r11 + 2] LONG $0x1a448842; BYTE $0x02 // mov byte ptr [rdx + r11 + 2], al LONG $0x44b60f42; WORD $0x031f // movzx eax, byte ptr [rdi + r11 + 3] LONG $0x1e442a42; BYTE $0x03 // sub al, byte ptr [rsi + r11 + 3] LONG $0x1a448842; BYTE $0x03 // mov byte ptr [rdx + r11 + 3], al LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB4_17 LBB4_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB4_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB4_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB4_14 LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB4_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB4_11 LBB4_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB4_8: LONG $0x6f7ea1c4; WORD $0x1704 // vmovdqu ymm0, ymmword ptr [rdi + r10] LONG $0x6f7ea1c4; WORD $0x174c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + r10 
+ 32] LONG $0x6f7ea1c4; WORD $0x1754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + r10 + 64] LONG $0x6f7ea1c4; WORD $0x175c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + r10 + 96] LONG $0xf87da1c4; WORD $0x1604 // vpsubb ymm0, ymm0, ymmword ptr [rsi + r10] LONG $0xf875a1c4; WORD $0x164c; BYTE $0x20 // vpsubb ymm1, ymm1, ymmword ptr [rsi + r10 + 32] LONG $0xf86da1c4; WORD $0x1654; BYTE $0x40 // vpsubb ymm2, ymm2, ymmword ptr [rsi + r10 + 64] LONG $0xf865a1c4; WORD $0x165c; BYTE $0x60 // vpsubb ymm3, ymm3, ymmword ptr [rsi + r10 + 96] LONG $0x7f7ea1c4; WORD $0x1204 // vmovdqu ymmword ptr [rdx + r10], ymm0 LONG $0x7f7ea1c4; WORD $0x124c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x1254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x125c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + r10 + 96], ymm3 LONG $0x80ea8349 // sub r10, -128 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB4_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB4_18 LONG $0x10f98341 // cmp r9d, 16 JB LBB4_14 LBB4_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x0fe28341 // and r10d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB4_12: LONG $0x6f7aa1c4; WORD $0x0f04 // vmovdqu xmm0, xmmword ptr [rdi + r9] LONG $0xf879a1c4; WORD $0x0e04 // vpsubb xmm0, xmm0, xmmword ptr [rsi + r9] LONG $0x7f7aa1c4; WORD $0x0a04 // vmovdqu xmmword ptr [rdx + r9], xmm0 LONG $0x10c18349 // add r9, 16 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB4_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB4_14 JMP LBB4_18 LCPI5_0: // TEXT ·_uint8_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB5_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB5_3 WORD 
$0x3145; BYTE $0xc9 // xor r9d, r9d LBB5_14: WORD $0x2944; BYTE $0xc9 // sub ecx, r9d WORD $0x894d; BYTE $0xca // mov r10, r9 WORD $0xf749; BYTE $0xd2 // not r10 WORD $0x014d; BYTE $0xc2 // add r10, r8 LONG $0x03e18348 // and rcx, 3 JE LBB5_16 LBB5_15: LONG $0x04b60f42; BYTE $0x0e // movzx eax, byte ptr [rsi + r9] LONG $0x0f24f642 // mul byte ptr [rdi + r9] LONG $0x0a048842 // mov byte ptr [rdx + r9], al WORD $0xff49; BYTE $0xc1 // inc r9 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB5_15 LBB5_16: LONG $0x03fa8349 // cmp r10, 3 JB LBB5_18 LBB5_17: LONG $0x04b60f42; BYTE $0x0e // movzx eax, byte ptr [rsi + r9] LONG $0x0f24f642 // mul byte ptr [rdi + r9] LONG $0x0a048842 // mov byte ptr [rdx + r9], al LONG $0x44b60f42; WORD $0x010e // movzx eax, byte ptr [rsi + r9 + 1] LONG $0x0f64f642; BYTE $0x01 // mul byte ptr [rdi + r9 + 1] LONG $0x0a448842; BYTE $0x01 // mov byte ptr [rdx + r9 + 1], al LONG $0x44b60f42; WORD $0x020e // movzx eax, byte ptr [rsi + r9 + 2] LONG $0x0f64f642; BYTE $0x02 // mul byte ptr [rdi + r9 + 2] LONG $0x0a448842; BYTE $0x02 // mov byte ptr [rdx + r9 + 2], al LONG $0x44b60f42; WORD $0x030e // movzx eax, byte ptr [rsi + r9 + 3] LONG $0x0f64f642; BYTE $0x03 // mul byte ptr [rdi + r9 + 3] LONG $0x0a448842; BYTE $0x03 // mov byte ptr [rdx + r9 + 3], al LONG $0x04c18349 // add r9, 4 WORD $0x394d; BYTE $0xc8 // cmp r8, r9 JNE LBB5_17 LBB5_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB5_3: WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xc9 // xor r9d, r9d LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB5_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB5_14 LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB5_7 WORD $0x3145; BYTE $0xc9 // xor r9d, r9d JMP LBB5_11 LBB5_7: WORD $0x8941; BYTE $0xca // mov r10d, ecx 
LONG $0x7fe28341 // and r10d, 127 WORD $0x894d; BYTE $0xc1 // mov r9, r8 WORD $0x294d; BYTE $0xd1 // sub r9, r10 WORD $0xc031 // xor eax, eax QUAD $0x0000014b056ffec5 // vmovdqu ymm0, ymmword ptr [rip + .LCPI5_0] LBB5_8: LONG $0x1c6ffec5; BYTE $0x07 // vmovdqu ymm3, ymmword ptr [rdi + rax] LONG $0x646ffec5; WORD $0x2007 // vmovdqu ymm4, ymmword ptr [rdi + rax + 32] LONG $0x6c6ffec5; WORD $0x4007 // vmovdqu ymm5, ymmword ptr [rdi + rax + 64] LONG $0x4c6ffec5; WORD $0x6007 // vmovdqu ymm1, ymmword ptr [rdi + rax + 96] LONG $0x346ffec5; BYTE $0x06 // vmovdqu ymm6, ymmword ptr [rsi + rax] LONG $0x7c6ffec5; WORD $0x2006 // vmovdqu ymm7, ymmword ptr [rsi + rax + 32] LONG $0x446f7ec5; WORD $0x4006 // vmovdqu ymm8, ymmword ptr [rsi + rax + 64] LONG $0x546ffec5; WORD $0x6006 // vmovdqu ymm2, ymmword ptr [rsi + rax + 96] LONG $0xcb6865c5 // vpunpckhbw ymm9, ymm3, ymm3 LONG $0xd6684dc5 // vpunpckhbw ymm10, ymm6, ymm6 LONG $0xd52d41c4; BYTE $0xc9 // vpmullw ymm9, ymm10, ymm9 LONG $0xc8db35c5 // vpand ymm9, ymm9, ymm0 LONG $0xdb60e5c5 // vpunpcklbw ymm3, ymm3, ymm3 LONG $0xf660cdc5 // vpunpcklbw ymm6, ymm6, ymm6 LONG $0xdbd5cdc5 // vpmullw ymm3, ymm6, ymm3 LONG $0xd8dbe5c5 // vpand ymm3, ymm3, ymm0 LONG $0x6765c1c4; BYTE $0xd9 // vpackuswb ymm3, ymm3, ymm9 LONG $0xf468ddc5 // vpunpckhbw ymm6, ymm4, ymm4 LONG $0xcf6845c5 // vpunpckhbw ymm9, ymm7, ymm7 LONG $0xf6d5b5c5 // vpmullw ymm6, ymm9, ymm6 LONG $0xf0dbcdc5 // vpand ymm6, ymm6, ymm0 LONG $0xe460ddc5 // vpunpcklbw ymm4, ymm4, ymm4 LONG $0xff60c5c5 // vpunpcklbw ymm7, ymm7, ymm7 LONG $0xe4d5c5c5 // vpmullw ymm4, ymm7, ymm4 LONG $0xe0dbddc5 // vpand ymm4, ymm4, ymm0 LONG $0xe667ddc5 // vpackuswb ymm4, ymm4, ymm6 LONG $0xf568d5c5 // vpunpckhbw ymm6, ymm5, ymm5 LONG $0x683dc1c4; BYTE $0xf8 // vpunpckhbw ymm7, ymm8, ymm8 LONG $0xf6d5c5c5 // vpmullw ymm6, ymm7, ymm6 LONG $0xf0dbcdc5 // vpand ymm6, ymm6, ymm0 LONG $0xed60d5c5 // vpunpcklbw ymm5, ymm5, ymm5 LONG $0x603dc1c4; BYTE $0xf8 // vpunpcklbw ymm7, ymm8, ymm8 LONG $0xedd5c5c5 
// vpmullw ymm5, ymm7, ymm5 LONG $0xe8dbd5c5 // vpand ymm5, ymm5, ymm0 LONG $0xee67d5c5 // vpackuswb ymm5, ymm5, ymm6 LONG $0xf168f5c5 // vpunpckhbw ymm6, ymm1, ymm1 LONG $0xfa68edc5 // vpunpckhbw ymm7, ymm2, ymm2 LONG $0xf6d5c5c5 // vpmullw ymm6, ymm7, ymm6 LONG $0xf0dbcdc5 // vpand ymm6, ymm6, ymm0 LONG $0xc960f5c5 // vpunpcklbw ymm1, ymm1, ymm1 LONG $0xd260edc5 // vpunpcklbw ymm2, ymm2, ymm2 LONG $0xc9d5edc5 // vpmullw ymm1, ymm2, ymm1 LONG $0xc8dbf5c5 // vpand ymm1, ymm1, ymm0 LONG $0xce67f5c5 // vpackuswb ymm1, ymm1, ymm6 LONG $0x1c7ffec5; BYTE $0x02 // vmovdqu ymmword ptr [rdx + rax], ymm3 LONG $0x647ffec5; WORD $0x2002 // vmovdqu ymmword ptr [rdx + rax + 32], ymm4 LONG $0x6c7ffec5; WORD $0x4002 // vmovdqu ymmword ptr [rdx + rax + 64], ymm5 LONG $0x4c7ffec5; WORD $0x6002 // vmovdqu ymmword ptr [rdx + rax + 96], ymm1 LONG $0x80e88348 // sub rax, -128 WORD $0x3949; BYTE $0xc1 // cmp r9, rax JNE LBB5_8 WORD $0x854d; BYTE $0xd2 // test r10, r10 JE LBB5_18 LONG $0x10fa8341 // cmp r10d, 16 JB LBB5_14 LBB5_11: WORD $0x894c; BYTE $0xc8 // mov rax, r9 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x0fe28341 // and r10d, 15 WORD $0x894d; BYTE $0xc1 // mov r9, r8 WORD $0x294d; BYTE $0xd1 // sub r9, r10 QUAD $0x0000003a056ffec5 // vmovdqu ymm0, ymmword ptr [rip + .LCPI5_0] LBB5_12: LONG $0x307de2c4; WORD $0x070c // vpmovzxbw ymm1, xmmword ptr [rdi + rax] LONG $0x307de2c4; WORD $0x0614 // vpmovzxbw ymm2, xmmword ptr [rsi + rax] LONG $0xc9d5edc5 // vpmullw ymm1, ymm2, ymm1 LONG $0xc8dbf5c5 // vpand ymm1, ymm1, ymm0 LONG $0x397de3c4; WORD $0x01ca // vextracti128 xmm2, ymm1, 1 LONG $0xca67f1c5 // vpackuswb xmm1, xmm1, xmm2 LONG $0x0c7ffac5; BYTE $0x02 // vmovdqu xmmword ptr [rdx + rax], xmm1 LONG $0x10c08348 // add rax, 16 WORD $0x3949; BYTE $0xc1 // cmp r9, rax JNE LBB5_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB5_14 JMP LBB5_18 WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD 
$0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff TEXT ·_uint8_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5741 // push r15 WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB6_12 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x20f98349 // cmp r9, 32 JAE LBB6_3 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB6_8 LBB6_3: WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LONG $0x20f88348 // cmp rax, 32 JB LBB6_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB6_8 WORD $0x8941; BYTE $0xcb // mov r11d, ecx LONG $0x1fe38341 // and r11d, 31 WORD $0x894d; BYTE $0xca // mov r10, r9 WORD $0x294d; BYTE $0xda // sub r10, r11 WORD $0x3145; BYTE $0xf6 // xor r14d, r14d LBB6_6: LONG $0x6f7aa1c4; WORD $0x3614 // vmovdqu xmm2, xmmword ptr [rsi + r14] LONG $0x6f7aa1c4; WORD $0x3644; BYTE $0x10 // vmovdqu xmm0, xmmword ptr [rsi + r14 + 16] LONG $0x1479e3c4; WORD $0x01d3 // vpextrb ebx, xmm2, 1 LONG $0x6f7aa1c4; WORD $0x371c // vmovdqu xmm3, xmmword ptr [rdi + r14] LONG $0x6f7aa1c4; WORD $0x374c; BYTE $0x10 // vmovdqu xmm1, xmmword ptr [rdi + r14 + 16] LONG $0x1479e3c4; WORD $0x01d8 // vpextrb eax, xmm3, 1 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl LONG $0xc0b60f44 // movzx r8d, al LONG $0xd37ef9c5 // vmovd ebx, xmm2 LONG $0xd87ef9c5 // vmovd eax, xmm3 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0xe06ef9c5 // vmovd xmm4, eax LONG $0x2059c3c4; WORD $0x01e0 // vpinsrb xmm4, xmm4, r8d, 1 LONG $0x1479e3c4; WORD $0x02d3 // vpextrb ebx, xmm2, 2 LONG $0x1479e3c4; WORD $0x02d8 // vpextrb eax, xmm3, 2 WORD $0xb60f; BYTE $0xc0 // movzx eax, 
al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x02e0 // vpinsrb xmm4, xmm4, eax, 2 LONG $0x1479e3c4; WORD $0x03d3 // vpextrb ebx, xmm2, 3 LONG $0x1479e3c4; WORD $0x03d8 // vpextrb eax, xmm3, 3 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x03e0 // vpinsrb xmm4, xmm4, eax, 3 LONG $0x1479e3c4; WORD $0x04d3 // vpextrb ebx, xmm2, 4 LONG $0x1479e3c4; WORD $0x04d8 // vpextrb eax, xmm3, 4 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x04e0 // vpinsrb xmm4, xmm4, eax, 4 LONG $0x1479e3c4; WORD $0x05d3 // vpextrb ebx, xmm2, 5 LONG $0x1479e3c4; WORD $0x05d8 // vpextrb eax, xmm3, 5 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x05e0 // vpinsrb xmm4, xmm4, eax, 5 LONG $0x1479e3c4; WORD $0x06d3 // vpextrb ebx, xmm2, 6 LONG $0x1479e3c4; WORD $0x06d8 // vpextrb eax, xmm3, 6 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x06e0 // vpinsrb xmm4, xmm4, eax, 6 LONG $0x1479e3c4; WORD $0x07d3 // vpextrb ebx, xmm2, 7 LONG $0x1479e3c4; WORD $0x07d8 // vpextrb eax, xmm3, 7 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x07e0 // vpinsrb xmm4, xmm4, eax, 7 LONG $0x1479e3c4; WORD $0x08d3 // vpextrb ebx, xmm2, 8 LONG $0x1479e3c4; WORD $0x08d8 // vpextrb eax, xmm3, 8 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x08e0 // vpinsrb xmm4, xmm4, eax, 8 LONG $0x1479e3c4; WORD $0x09d3 // vpextrb ebx, xmm2, 9 LONG $0x1479e3c4; WORD $0x09d8 // vpextrb eax, xmm3, 9 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // 
movzx eax, al LONG $0x2059e3c4; WORD $0x09e0 // vpinsrb xmm4, xmm4, eax, 9 LONG $0x1479e3c4; WORD $0x0ad3 // vpextrb ebx, xmm2, 10 LONG $0x1479e3c4; WORD $0x0ad8 // vpextrb eax, xmm3, 10 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x0ae0 // vpinsrb xmm4, xmm4, eax, 10 LONG $0x1479e3c4; WORD $0x0bd3 // vpextrb ebx, xmm2, 11 LONG $0x1479e3c4; WORD $0x0bd8 // vpextrb eax, xmm3, 11 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x0be0 // vpinsrb xmm4, xmm4, eax, 11 LONG $0x1479e3c4; WORD $0x0cd3 // vpextrb ebx, xmm2, 12 LONG $0x1479e3c4; WORD $0x0cd8 // vpextrb eax, xmm3, 12 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x0ce0 // vpinsrb xmm4, xmm4, eax, 12 LONG $0x1479e3c4; WORD $0x0dd3 // vpextrb ebx, xmm2, 13 LONG $0x1479e3c4; WORD $0x0dd8 // vpextrb eax, xmm3, 13 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x0de0 // vpinsrb xmm4, xmm4, eax, 13 LONG $0x1479e3c4; WORD $0x0ed3 // vpextrb ebx, xmm2, 14 LONG $0x1479e3c4; WORD $0x0ed8 // vpextrb eax, xmm3, 14 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x0ee0 // vpinsrb xmm4, xmm4, eax, 14 LONG $0x1479e3c4; WORD $0x0fd3 // vpextrb ebx, xmm2, 15 LONG $0x1479e3c4; WORD $0x0fd8 // vpextrb eax, xmm3, 15 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2059e3c4; WORD $0x0fd0 // vpinsrb xmm2, xmm4, eax, 15 LONG $0x1479e3c4; WORD $0x01c3 // vpextrb ebx, xmm0, 1 LONG $0x1479e3c4; WORD $0x01c8 // vpextrb eax, xmm1, 1 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl LONG $0xc0b60f44 // movzx r8d, al LONG $0xc37ef9c5 // vmovd 
ebx, xmm0 LONG $0xc87ef9c5 // vmovd eax, xmm1 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0xd86ef9c5 // vmovd xmm3, eax LONG $0x2061c3c4; WORD $0x01d8 // vpinsrb xmm3, xmm3, r8d, 1 LONG $0x1479e3c4; WORD $0x02c3 // vpextrb ebx, xmm0, 2 LONG $0x1479e3c4; WORD $0x02c8 // vpextrb eax, xmm1, 2 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x02d8 // vpinsrb xmm3, xmm3, eax, 2 LONG $0x1479e3c4; WORD $0x03c3 // vpextrb ebx, xmm0, 3 LONG $0x1479e3c4; WORD $0x03c8 // vpextrb eax, xmm1, 3 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x03d8 // vpinsrb xmm3, xmm3, eax, 3 LONG $0x1479e3c4; WORD $0x04c3 // vpextrb ebx, xmm0, 4 LONG $0x1479e3c4; WORD $0x04c8 // vpextrb eax, xmm1, 4 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x04d8 // vpinsrb xmm3, xmm3, eax, 4 LONG $0x1479e3c4; WORD $0x05c3 // vpextrb ebx, xmm0, 5 LONG $0x1479e3c4; WORD $0x05c8 // vpextrb eax, xmm1, 5 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x05d8 // vpinsrb xmm3, xmm3, eax, 5 LONG $0x1479e3c4; WORD $0x06c3 // vpextrb ebx, xmm0, 6 LONG $0x1479e3c4; WORD $0x06c8 // vpextrb eax, xmm1, 6 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x06d8 // vpinsrb xmm3, xmm3, eax, 6 LONG $0x1479e3c4; WORD $0x07c3 // vpextrb ebx, xmm0, 7 LONG $0x1479e3c4; WORD $0x07c8 // vpextrb eax, xmm1, 7 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x07d8 // vpinsrb xmm3, xmm3, eax, 7 LONG $0x1479e3c4; WORD $0x08c3 // vpextrb ebx, xmm0, 8 LONG $0x1479e3c4; WORD 
$0x08c8 // vpextrb eax, xmm1, 8 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x08d8 // vpinsrb xmm3, xmm3, eax, 8 LONG $0x1479e3c4; WORD $0x09c3 // vpextrb ebx, xmm0, 9 LONG $0x1479e3c4; WORD $0x09c8 // vpextrb eax, xmm1, 9 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x09d8 // vpinsrb xmm3, xmm3, eax, 9 LONG $0x1479e3c4; WORD $0x0ac3 // vpextrb ebx, xmm0, 10 LONG $0x1479e3c4; WORD $0x0ac8 // vpextrb eax, xmm1, 10 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x0ad8 // vpinsrb xmm3, xmm3, eax, 10 LONG $0x1479e3c4; WORD $0x0bc3 // vpextrb ebx, xmm0, 11 LONG $0x1479e3c4; WORD $0x0bc8 // vpextrb eax, xmm1, 11 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x0bd8 // vpinsrb xmm3, xmm3, eax, 11 LONG $0x1479e3c4; WORD $0x0cc3 // vpextrb ebx, xmm0, 12 LONG $0x1479e3c4; WORD $0x0cc8 // vpextrb eax, xmm1, 12 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x0cd8 // vpinsrb xmm3, xmm3, eax, 12 LONG $0x1479e3c4; WORD $0x0dc3 // vpextrb ebx, xmm0, 13 LONG $0x1479e3c4; WORD $0x0dc8 // vpextrb eax, xmm1, 13 WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xf3f6 // div bl WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2061e3c4; WORD $0x0dd8 // vpinsrb xmm3, xmm3, eax, 13 LONG $0x1479c3c4; WORD $0x0ec0 // vpextrb r8d, xmm0, 14 LONG $0x1479e3c4; WORD $0x0ec8 // vpextrb eax, xmm1, 14 LONG $0x1479c3c4; WORD $0x0fc7 // vpextrb r15d, xmm0, 15 WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x1479e3c4; WORD $0x0fcb // vpextrb ebx, xmm1, 15 WORD $0xf641; BYTE $0xf0 // div r8b WORD $0x8941; BYTE $0xc0 // mov r8d, eax WORD $0xb60f; BYTE $0xc3 // movzx eax, bl WORD 
$0xf641; BYTE $0xf7 // div r15b LONG $0xd8b60f41 // movzx ebx, r8b LONG $0x2061e3c4; WORD $0x0ec3 // vpinsrb xmm0, xmm3, ebx, 14 WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0x2079e3c4; WORD $0x0fc0 // vpinsrb xmm0, xmm0, eax, 15 LONG $0x7f7aa1c4; WORD $0x3244; BYTE $0x10 // vmovdqu xmmword ptr [rdx + r14 + 16], xmm0 LONG $0x7f7aa1c4; WORD $0x3214 // vmovdqu xmmword ptr [rdx + r14], xmm2 LONG $0x20c68349 // add r14, 32 WORD $0x394d; BYTE $0xf2 // cmp r10, r14 JNE LBB6_6 WORD $0x854d; BYTE $0xdb // test r11, r11 JE LBB6_12 LBB6_8: WORD $0x2944; BYTE $0xd1 // sub ecx, r10d LONG $0x015a8d49 // lea rbx, [r10 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB6_10 LONG $0x04b60f42; BYTE $0x17 // movzx eax, byte ptr [rdi + r10] LONG $0x1634f642 // div byte ptr [rsi + r10] LONG $0x12048842 // mov byte ptr [rdx + r10], al WORD $0x8949; BYTE $0xda // mov r10, rbx LBB6_10: WORD $0x3949; BYTE $0xd9 // cmp r9, rbx JE LBB6_12 LBB6_11: LONG $0x04b60f42; BYTE $0x17 // movzx eax, byte ptr [rdi + r10] LONG $0x1634f642 // div byte ptr [rsi + r10] LONG $0x12048842 // mov byte ptr [rdx + r10], al LONG $0x44b60f42; WORD $0x0117 // movzx eax, byte ptr [rdi + r10 + 1] LONG $0x1674f642; BYTE $0x01 // div byte ptr [rsi + r10 + 1] LONG $0x12448842; BYTE $0x01 // mov byte ptr [rdx + r10 + 1], al LONG $0x02c28349 // add r10, 2 WORD $0x394d; BYTE $0xd1 // cmp r9, r10 JNE LBB6_11 LBB6_12: LONG $0xe8658d48 // lea rsp, [rbp - 24] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 WORD $0x5f41 // pop r15 BYTE $0x5d // pop rbp BYTE $0xc3 // ret TEXT ·_uint16_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB7_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x08f88349 // cmp r8, 8 JAE LBB7_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d WORD $0xc031 // xor eax, eax JMP LBB7_13 LBB7_1: WORD $0xc031 // xor eax, eax JMP LBB7_14 LBB7_4: LONG 
$0x40f88341 // cmp r8d, 64 JAE LBB7_6 WORD $0xc031 // xor eax, eax WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB7_10 LBB7_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 LBB7_7: LONG $0x04fdfdc5; BYTE $0x47 // vpaddw ymm0, ymm0, ymmword ptr [rdi + 2*rax] LONG $0x4cfdf5c5; WORD $0x2047 // vpaddw ymm1, ymm1, ymmword ptr [rdi + 2*rax + 32] LONG $0x54fdedc5; WORD $0x4047 // vpaddw ymm2, ymm2, ymmword ptr [rdi + 2*rax + 64] LONG $0x5cfde5c5; WORD $0x6047 // vpaddw ymm3, ymm3, ymmword ptr [rdi + 2*rax + 96] LONG $0x40c08348 // add rax, 64 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB7_7 LONG $0xc0fdf5c5 // vpaddw ymm0, ymm1, ymm0 LONG $0xc0fdedc5 // vpaddw ymm0, ymm2, ymm0 LONG $0xc0fde5c5 // vpaddw ymm0, ymm3, ymm0 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xd072f1c5; BYTE $0x10 // vpsrld xmm1, xmm0, 16 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB7_14 LONG $0x08f98341 // cmp r9d, 8 JB LBB7_13 LBB7_10: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x07 // and edx, 7 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx WORD $0xb70f; BYTE $0xc0 // movzx eax, ax LONG $0xc06ef9c5 // vmovd xmm0, eax LBB7_11: LONG $0x04fdf9c5; BYTE $0x4f // vpaddw xmm0, xmm0, xmmword ptr [rdi + 2*rcx] LONG $0x08c18348 // add rcx, 8 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB7_11 LONG 
$0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xd072f1c5; BYTE $0x10 // vpsrld xmm1, xmm0, 16 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB7_14 LBB7_13: LONG $0x04034266; BYTE $0x57 // add ax, word ptr [rdi + 2*r10] WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB7_13 LBB7_14: WORD $0x8966; BYTE $0x06 // mov word ptr [rsi], ax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint16_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xb70f; BYTE $0x07 // movzx eax, word ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB8_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x08f88349 // cmp r8, 8 JAE LBB8_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB8_14 LBB8_1: WORD $0xc189 // mov ecx, eax LBB8_15: WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB8_4: LONG $0x40f88341 // cmp r8d, 64 JAE LBB8_6 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB8_11 LBB8_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB8_7: LONG $0x3a7de2c4; WORD $0x4704 // vpminuw ymm0, ymm0, ymmword ptr [rdi + 2*rax] LONG $0x3a75e2c4; WORD $0x474c; BYTE $0x20 // 
vpminuw ymm1, ymm1, ymmword ptr [rdi + 2*rax + 32] LONG $0x3a6de2c4; WORD $0x4754; BYTE $0x40 // vpminuw ymm2, ymm2, ymmword ptr [rdi + 2*rax + 64] LONG $0x3a65e2c4; WORD $0x475c; BYTE $0x60 // vpminuw ymm3, ymm3, ymmword ptr [rdi + 2*rax + 96] LONG $0x40c08348 // add rax, 64 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB8_7 LONG $0x3a7de2c4; BYTE $0xc1 // vpminuw ymm0, ymm0, ymm1 LONG $0x3a7de2c4; BYTE $0xc2 // vpminuw ymm0, ymm0, ymm2 LONG $0x3a7de2c4; BYTE $0xc3 // vpminuw ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0x3a79e2c4; BYTE $0xc1 // vpminuw xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB8_9 LONG $0x08f98341 // cmp r9d, 8 JB LBB8_14 LBB8_11: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x07 // and edx, 7 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7979e2c4; BYTE $0xc0 // vpbroadcastw xmm0, xmm0 LBB8_12: LONG $0x3a79e2c4; WORD $0x4f04 // vpminuw xmm0, xmm0, xmmword ptr [rdi + 2*rcx] LONG $0x08c18348 // add rcx, 8 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB8_12 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0xc189 // mov ecx, eax WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB8_15 LBB8_14: LONG $0x0cb70f42; BYTE $0x57 // movzx ecx, word ptr [rdi + 2*r10] WORD $0x3966; BYTE $0xc1 // cmp cx, ax WORD $0x430f; BYTE $0xc8 // cmovae ecx, eax WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0xc889 // mov eax, ecx WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB8_14 JMP LBB8_15 LBB8_9: WORD $0xc189 // mov ecx, eax WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint16_max(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX 
BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xb70f; BYTE $0x07 // movzx eax, word ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB9_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x08f88349 // cmp r8, 8 JAE LBB9_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB9_14 LBB9_1: WORD $0xc189 // mov ecx, eax LBB9_15: WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB9_4: LONG $0x40f88341 // cmp r8d, 64 JAE LBB9_6 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB9_11 LBB9_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB9_7: LONG $0x3e7de2c4; WORD $0x4704 // vpmaxuw ymm0, ymm0, ymmword ptr [rdi + 2*rax] LONG $0x3e75e2c4; WORD $0x474c; BYTE $0x20 // vpmaxuw ymm1, ymm1, ymmword ptr [rdi + 2*rax + 32] LONG $0x3e6de2c4; WORD $0x4754; BYTE $0x40 // vpmaxuw ymm2, ymm2, ymmword ptr [rdi + 2*rax + 64] LONG $0x3e65e2c4; WORD $0x475c; BYTE $0x60 // vpmaxuw ymm3, ymm3, ymmword ptr [rdi + 2*rax + 96] LONG $0x40c08348 // add rax, 64 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB9_7 LONG $0x3e7de2c4; BYTE $0xc1 // vpmaxuw ymm0, ymm0, ymm1 LONG $0x3e7de2c4; BYTE $0xc2 // vpmaxuw ymm0, ymm0, ymm2 LONG $0x3e7de2c4; BYTE $0xc3 // vpmaxuw ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0x3e79e2c4; BYTE $0xc1 // vpmaxuw xmm0, xmm0, xmm1 LONG $0xc976f1c5 // vpcmpeqd xmm1, xmm1, xmm1 LONG $0xc1eff9c5 // vpxor xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0xd0f7 // 
not eax WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB9_9 LONG $0x08f98341 // cmp r9d, 8 JB LBB9_14 LBB9_11: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x07 // and edx, 7 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7979e2c4; BYTE $0xc0 // vpbroadcastw xmm0, xmm0 LBB9_12: LONG $0x3e79e2c4; WORD $0x4f04 // vpmaxuw xmm0, xmm0, xmmword ptr [rdi + 2*rcx] LONG $0x08c18348 // add rcx, 8 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB9_12 LONG $0xc976f1c5 // vpcmpeqd xmm1, xmm1, xmm1 LONG $0xc1eff9c5 // vpxor xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0xd0f7 // not eax WORD $0xc189 // mov ecx, eax WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB9_15 LBB9_14: LONG $0x0cb70f42; BYTE $0x57 // movzx ecx, word ptr [rdi + 2*r10] WORD $0x3966; BYTE $0xc1 // cmp cx, ax WORD $0x460f; BYTE $0xc8 // cmovbe ecx, eax WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0xc889 // mov eax, ecx WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB9_14 JMP LBB9_15 LBB9_9: WORD $0xc189 // mov ecx, eax WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint16_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB10_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x08f88349 // cmp r8, 8 JAE LBB10_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB10_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB10_16 LBB10_15: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG 
$0x04034266; BYTE $0x5f // add ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB10_15 LBB10_16: LONG $0x03f98349 // cmp r9, 3 JB LBB10_18 LBB10_17: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG $0x04034266; BYTE $0x5f // add ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax LONG $0x44b70f42; WORD $0x025e // movzx eax, word ptr [rsi + 2*r11 + 2] LONG $0x44034266; WORD $0x025f // add ax, word ptr [rdi + 2*r11 + 2] LONG $0x44894266; WORD $0x025a // mov word ptr [rdx + 2*r11 + 2], ax LONG $0x44b70f42; WORD $0x045e // movzx eax, word ptr [rsi + 2*r11 + 4] LONG $0x44034266; WORD $0x045f // add ax, word ptr [rdi + 2*r11 + 4] LONG $0x44894266; WORD $0x045a // mov word ptr [rdx + 2*r11 + 4], ax LONG $0x44b70f42; WORD $0x065e // movzx eax, word ptr [rsi + 2*r11 + 6] LONG $0x44034266; WORD $0x065f // add ax, word ptr [rdi + 2*r11 + 6] LONG $0x44894266; WORD $0x065a // mov word ptr [rdx + 2*r11 + 6], ax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB10_17 LBB10_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB10_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB10_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB10_14 LONG $0x40f88341 // cmp r8d, 64 JAE LBB10_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB10_11 LBB10_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB10_8: LONG $0x6f7ea1c4; WORD $0x5604 // 
vmovdqu ymm0, ymmword ptr [rsi + 2*r10] LONG $0x6f7ea1c4; WORD $0x564c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 2*r10 + 32] LONG $0x6f7ea1c4; WORD $0x5654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 2*r10 + 64] LONG $0x6f7ea1c4; WORD $0x565c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 2*r10 + 96] LONG $0xfd7da1c4; WORD $0x5704 // vpaddw ymm0, ymm0, ymmword ptr [rdi + 2*r10] LONG $0xfd75a1c4; WORD $0x574c; BYTE $0x20 // vpaddw ymm1, ymm1, ymmword ptr [rdi + 2*r10 + 32] LONG $0xfd6da1c4; WORD $0x5754; BYTE $0x40 // vpaddw ymm2, ymm2, ymmword ptr [rdi + 2*r10 + 64] LONG $0xfd65a1c4; WORD $0x575c; BYTE $0x60 // vpaddw ymm3, ymm3, ymmword ptr [rdi + 2*r10 + 96] LONG $0x7f7ea1c4; WORD $0x5204 // vmovdqu ymmword ptr [rdx + 2*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x524c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 2*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x5254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 2*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x525c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 2*r10 + 96], ymm3 LONG $0x40c28349 // add r10, 64 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB10_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB10_18 LONG $0x08f98341 // cmp r9d, 8 JB LBB10_14 LBB10_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x07e28341 // and r10d, 7 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB10_12: LONG $0x6f7aa1c4; WORD $0x4e04 // vmovdqu xmm0, xmmword ptr [rsi + 2*r9] LONG $0xfd79a1c4; WORD $0x4f04 // vpaddw xmm0, xmm0, xmmword ptr [rdi + 2*r9] LONG $0x7f7aa1c4; WORD $0x4a04 // vmovdqu xmmword ptr [rdx + 2*r9], xmm0 LONG $0x08c18349 // add r9, 8 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB10_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB10_14 JMP LBB10_18 TEXT ·_uint16_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // 
and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB11_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x08f88349 // cmp r8, 8 JAE LBB11_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB11_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB11_16 LBB11_15: LONG $0x04b70f42; BYTE $0x5f // movzx eax, word ptr [rdi + 2*r11] LONG $0x042b4266; BYTE $0x5e // sub ax, word ptr [rsi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB11_15 LBB11_16: LONG $0x03f98349 // cmp r9, 3 JB LBB11_18 LBB11_17: LONG $0x04b70f42; BYTE $0x5f // movzx eax, word ptr [rdi + 2*r11] LONG $0x042b4266; BYTE $0x5e // sub ax, word ptr [rsi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax LONG $0x44b70f42; WORD $0x025f // movzx eax, word ptr [rdi + 2*r11 + 2] LONG $0x442b4266; WORD $0x025e // sub ax, word ptr [rsi + 2*r11 + 2] LONG $0x44894266; WORD $0x025a // mov word ptr [rdx + 2*r11 + 2], ax LONG $0x44b70f42; WORD $0x045f // movzx eax, word ptr [rdi + 2*r11 + 4] LONG $0x442b4266; WORD $0x045e // sub ax, word ptr [rsi + 2*r11 + 4] LONG $0x44894266; WORD $0x045a // mov word ptr [rdx + 2*r11 + 4], ax LONG $0x44b70f42; WORD $0x065f // movzx eax, word ptr [rdi + 2*r11 + 6] LONG $0x442b4266; WORD $0x065e // sub ax, word ptr [rsi + 2*r11 + 6] LONG $0x44894266; WORD $0x065a // mov word ptr [rdx + 2*r11 + 6], ax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB11_17 LBB11_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB11_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB11_14 WORD $0x8948; BYTE 
$0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB11_14 LONG $0x40f88341 // cmp r8d, 64 JAE LBB11_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB11_11 LBB11_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB11_8: LONG $0x6f7ea1c4; WORD $0x5704 // vmovdqu ymm0, ymmword ptr [rdi + 2*r10] LONG $0x6f7ea1c4; WORD $0x574c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + 2*r10 + 32] LONG $0x6f7ea1c4; WORD $0x5754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + 2*r10 + 64] LONG $0x6f7ea1c4; WORD $0x575c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + 2*r10 + 96] LONG $0xf97da1c4; WORD $0x5604 // vpsubw ymm0, ymm0, ymmword ptr [rsi + 2*r10] LONG $0xf975a1c4; WORD $0x564c; BYTE $0x20 // vpsubw ymm1, ymm1, ymmword ptr [rsi + 2*r10 + 32] LONG $0xf96da1c4; WORD $0x5654; BYTE $0x40 // vpsubw ymm2, ymm2, ymmword ptr [rsi + 2*r10 + 64] LONG $0xf965a1c4; WORD $0x565c; BYTE $0x60 // vpsubw ymm3, ymm3, ymmword ptr [rsi + 2*r10 + 96] LONG $0x7f7ea1c4; WORD $0x5204 // vmovdqu ymmword ptr [rdx + 2*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x524c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 2*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x5254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 2*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x525c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 2*r10 + 96], ymm3 LONG $0x40c28349 // add r10, 64 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB11_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB11_18 LONG $0x08f98341 // cmp r9d, 8 JB LBB11_14 LBB11_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x07e28341 // and r10d, 7 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB11_12: LONG $0x6f7aa1c4; WORD $0x4f04 // vmovdqu xmm0, xmmword ptr [rdi + 2*r9] LONG $0xf979a1c4; WORD $0x4e04 // 
vpsubw xmm0, xmm0, xmmword ptr [rsi + 2*r9] LONG $0x7f7aa1c4; WORD $0x4a04 // vmovdqu xmmword ptr [rdx + 2*r9], xmm0 LONG $0x08c18349 // add r9, 8 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB11_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB11_14 JMP LBB11_18 TEXT ·_uint16_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB12_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x08f88349 // cmp r8, 8 JAE LBB12_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB12_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB12_16 LBB12_15: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG $0xaf0f4266; WORD $0x5f04 // imul ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB12_15 LBB12_16: LONG $0x03f98349 // cmp r9, 3 JB LBB12_18 LBB12_17: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG $0xaf0f4266; WORD $0x5f04 // imul ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax LONG $0x44b70f42; WORD $0x025e // movzx eax, word ptr [rsi + 2*r11 + 2] LONG $0xaf0f4266; WORD $0x5f44; BYTE $0x02 // imul ax, word ptr [rdi + 2*r11 + 2] LONG $0x44894266; WORD $0x025a // mov word ptr [rdx + 2*r11 + 2], ax LONG $0x44b70f42; WORD $0x045e // movzx eax, word ptr [rsi + 2*r11 + 4] LONG $0xaf0f4266; WORD $0x5f44; BYTE $0x04 // imul ax, word ptr [rdi + 2*r11 + 4] LONG $0x44894266; WORD $0x045a // mov word ptr [rdx + 2*r11 + 4], ax LONG $0x44b70f42; WORD $0x065e // movzx eax, word ptr [rsi + 2*r11 + 6] LONG $0xaf0f4266; WORD $0x5f44; BYTE $0x06 // imul ax, word ptr [rdi + 
2*r11 + 6] LONG $0x44894266; WORD $0x065a // mov word ptr [rdx + 2*r11 + 6], ax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB12_17 LBB12_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB12_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB12_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB12_14 LONG $0x40f88341 // cmp r8d, 64 JAE LBB12_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB12_11 LBB12_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB12_8: LONG $0x6f7ea1c4; WORD $0x5604 // vmovdqu ymm0, ymmword ptr [rsi + 2*r10] LONG $0x6f7ea1c4; WORD $0x564c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 2*r10 + 32] LONG $0x6f7ea1c4; WORD $0x5654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 2*r10 + 64] LONG $0x6f7ea1c4; WORD $0x565c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 2*r10 + 96] LONG $0xd57da1c4; WORD $0x5704 // vpmullw ymm0, ymm0, ymmword ptr [rdi + 2*r10] LONG $0xd575a1c4; WORD $0x574c; BYTE $0x20 // vpmullw ymm1, ymm1, ymmword ptr [rdi + 2*r10 + 32] LONG $0xd56da1c4; WORD $0x5754; BYTE $0x40 // vpmullw ymm2, ymm2, ymmword ptr [rdi + 2*r10 + 64] LONG $0xd565a1c4; WORD $0x575c; BYTE $0x60 // vpmullw ymm3, ymm3, ymmword ptr [rdi + 2*r10 + 96] LONG $0x7f7ea1c4; WORD $0x5204 // vmovdqu ymmword ptr [rdx + 2*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x524c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 2*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x5254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 2*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x525c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 
2*r10 + 96], ymm3 LONG $0x40c28349 // add r10, 64 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB12_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB12_18 LONG $0x08f98341 // cmp r9d, 8 JB LBB12_14 LBB12_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x07e28341 // and r10d, 7 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB12_12: LONG $0x6f7aa1c4; WORD $0x4e04 // vmovdqu xmm0, xmmword ptr [rsi + 2*r9] LONG $0xd579a1c4; WORD $0x4f04 // vpmullw xmm0, xmm0, xmmword ptr [rdi + 2*r9] LONG $0x7f7aa1c4; WORD $0x4a04 // vmovdqu xmmword ptr [rdx + 2*r9], xmm0 LONG $0x08c18349 // add r9, 8 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB12_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB12_14 JMP LBB12_18 TEXT ·_uint16_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5741 // push r15 WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB13_12 WORD $0x8949; BYTE $0xd0 // mov r8, rdx WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x10fa8349 // cmp r10, 16 JAE LBB13_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB13_8 LBB13_3: WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x20f88348 // cmp rax, 32 JB LBB13_8 WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB13_8 WORD $0x8941; BYTE $0xce // mov r14d, ecx LONG $0x0fe68341 // and r14d, 15 WORD $0x894d; BYTE $0xd3 // mov r11, r10 WORD $0x294d; BYTE $0xf3 // sub r11, r14 WORD $0x3145; BYTE $0xff // xor r15d, r15d LBB13_6: LONG $0x6f7aa1c4; WORD $0x7f14 // vmovdqu xmm2, xmmword ptr [rdi + 2*r15] LONG $0xc2c5f9c5; BYTE $0x01 // vpextrw eax, xmm2, 1 LONG $0x6f7aa1c4; WORD $0x7f44; BYTE $0x10 // vmovdqu xmm0, 
xmmword ptr [rdi + 2*r15 + 16] LONG $0x6f7aa1c4; WORD $0x7e1c // vmovdqu xmm3, xmmword ptr [rsi + 2*r15] LONG $0xdbc5f9c5; BYTE $0x01 // vpextrw ebx, xmm3, 1 LONG $0x6f7aa1c4; WORD $0x7e4c; BYTE $0x10 // vmovdqu xmm1, xmmword ptr [rsi + 2*r15 + 16] WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xd07ef9c5 // vmovd eax, xmm2 LONG $0xdb7ef9c5 // vmovd ebx, xmm3 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xe06ef9c5 // vmovd xmm4, eax LONG $0xc459c1c4; WORD $0x01e1 // vpinsrw xmm4, xmm4, r9d, 1 LONG $0xc2c5f9c5; BYTE $0x02 // vpextrw eax, xmm2, 2 LONG $0xdbc5f9c5; BYTE $0x02 // vpextrw ebx, xmm3, 2 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xe0c4d9c5; BYTE $0x02 // vpinsrw xmm4, xmm4, eax, 2 LONG $0xc2c5f9c5; BYTE $0x03 // vpextrw eax, xmm2, 3 LONG $0xdbc5f9c5; BYTE $0x03 // vpextrw ebx, xmm3, 3 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xe0c4d9c5; BYTE $0x03 // vpinsrw xmm4, xmm4, eax, 3 LONG $0xc2c5f9c5; BYTE $0x04 // vpextrw eax, xmm2, 4 LONG $0xdbc5f9c5; BYTE $0x04 // vpextrw ebx, xmm3, 4 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xe0c4d9c5; BYTE $0x04 // vpinsrw xmm4, xmm4, eax, 4 LONG $0xc2c5f9c5; BYTE $0x05 // vpextrw eax, xmm2, 5 LONG $0xdbc5f9c5; BYTE $0x05 // vpextrw ebx, xmm3, 5 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xe0c4d9c5; BYTE $0x05 // vpinsrw xmm4, xmm4, eax, 5 LONG $0xc2c5f9c5; BYTE $0x06 // vpextrw eax, xmm2, 6 LONG $0xdbc5f9c5; BYTE $0x06 // vpextrw ebx, xmm3, 6 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xe0c4d9c5; BYTE $0x06 // vpinsrw xmm4, xmm4, eax, 6 LONG $0xc2c5f9c5; BYTE $0x07 // vpextrw eax, xmm2, 7 LONG $0xdbc5f9c5; BYTE $0x07 // vpextrw ebx, xmm3, 7 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xd0c4d9c5; BYTE $0x07 // vpinsrw xmm2, xmm4, eax, 7 LONG $0xc0c5f9c5; BYTE $0x01 // 
vpextrw eax, xmm0, 1 LONG $0xd9c5f9c5; BYTE $0x01 // vpextrw ebx, xmm1, 1 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xc07ef9c5 // vmovd eax, xmm0 LONG $0xcb7ef9c5 // vmovd ebx, xmm1 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xd86ef9c5 // vmovd xmm3, eax LONG $0xc461c1c4; WORD $0x01d9 // vpinsrw xmm3, xmm3, r9d, 1 LONG $0xc0c5f9c5; BYTE $0x02 // vpextrw eax, xmm0, 2 LONG $0xd9c5f9c5; BYTE $0x02 // vpextrw ebx, xmm1, 2 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xd8c4e1c5; BYTE $0x02 // vpinsrw xmm3, xmm3, eax, 2 LONG $0xc0c5f9c5; BYTE $0x03 // vpextrw eax, xmm0, 3 LONG $0xd9c5f9c5; BYTE $0x03 // vpextrw ebx, xmm1, 3 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xd8c4e1c5; BYTE $0x03 // vpinsrw xmm3, xmm3, eax, 3 LONG $0xc0c5f9c5; BYTE $0x04 // vpextrw eax, xmm0, 4 LONG $0xd9c5f9c5; BYTE $0x04 // vpextrw ebx, xmm1, 4 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xd8c4e1c5; BYTE $0x04 // vpinsrw xmm3, xmm3, eax, 4 LONG $0xc0c5f9c5; BYTE $0x05 // vpextrw eax, xmm0, 5 LONG $0xd9c5f9c5; BYTE $0x05 // vpextrw ebx, xmm1, 5 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xd8c4e1c5; BYTE $0x05 // vpinsrw xmm3, xmm3, eax, 5 LONG $0xc0c5f9c5; BYTE $0x06 // vpextrw eax, xmm0, 6 LONG $0xd9c5f9c5; BYTE $0x06 // vpextrw ebx, xmm1, 6 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xd8c4e1c5; BYTE $0x06 // vpinsrw xmm3, xmm3, eax, 6 LONG $0xc0c5f9c5; BYTE $0x07 // vpextrw eax, xmm0, 7 LONG $0xd9c5f9c5; BYTE $0x07 // vpextrw ebx, xmm1, 7 WORD $0xd231 // xor edx, edx WORD $0xf766; BYTE $0xf3 // div bx LONG $0xc0c4e1c5; BYTE $0x07 // vpinsrw xmm0, xmm3, eax, 7 LONG $0x7f7a81c4; WORD $0x7844; BYTE $0x10 // vmovdqu xmmword ptr [r8 + 2*r15 + 16], xmm0 LONG $0x7f7a81c4; WORD $0x7814 // vmovdqu xmmword ptr [r8 + 2*r15], xmm2 LONG $0x10c78349 // add r15, 16 WORD $0x394d; BYTE 
$0xfb // cmp r11, r15 JNE LBB13_6 WORD $0x854d; BYTE $0xf6 // test r14, r14 JE LBB13_12 LBB13_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d LONG $0x015b8d49 // lea rbx, [r11 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB13_10 LONG $0x04b70f42; BYTE $0x5f // movzx eax, word ptr [rdi + 2*r11] WORD $0xd231 // xor edx, edx LONG $0x34f74266; BYTE $0x5e // div word ptr [rsi + 2*r11] LONG $0x04894366; BYTE $0x58 // mov word ptr [r8 + 2*r11], ax WORD $0x8949; BYTE $0xdb // mov r11, rbx LBB13_10: WORD $0x3949; BYTE $0xda // cmp r10, rbx JE LBB13_12 LBB13_11: LONG $0x04b70f42; BYTE $0x5f // movzx eax, word ptr [rdi + 2*r11] WORD $0xd231 // xor edx, edx LONG $0x34f74266; BYTE $0x5e // div word ptr [rsi + 2*r11] LONG $0x04894366; BYTE $0x58 // mov word ptr [r8 + 2*r11], ax LONG $0x44b70f42; WORD $0x025f // movzx eax, word ptr [rdi + 2*r11 + 2] WORD $0xd231 // xor edx, edx LONG $0x74f74266; WORD $0x025e // div word ptr [rsi + 2*r11 + 2] LONG $0x44894366; WORD $0x0258 // mov word ptr [r8 + 2*r11 + 2], ax LONG $0x02c38349 // add r11, 2 WORD $0x394d; BYTE $0xda // cmp r10, r11 JNE LBB13_11 LBB13_12: LONG $0xe8658d48 // lea rsp, [rbp - 24] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 WORD $0x5f41 // pop r15 BYTE $0x5d // pop rbp BYTE $0xc3 // ret TEXT ·_uint32_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB14_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x20f88349 // cmp r8, 32 JAE LBB14_4 WORD $0xc931 // xor ecx, ecx WORD $0xc031 // xor eax, eax JMP LBB14_7 LBB14_1: WORD $0xc031 // xor eax, eax JMP LBB14_8 LBB14_4: WORD $0xe283; BYTE $0x1f // and edx, 31 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // 
vpxor xmm3, xmm3, xmm3 LBB14_5: LONG $0x04fefdc5; BYTE $0x87 // vpaddd ymm0, ymm0, ymmword ptr [rdi + 4*rax] LONG $0x4cfef5c5; WORD $0x2087 // vpaddd ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32] LONG $0x54feedc5; WORD $0x4087 // vpaddd ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64] LONG $0x5cfee5c5; WORD $0x6087 // vpaddd ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96] LONG $0x20c08348 // add rax, 32 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB14_5 LONG $0xc0fef5c5 // vpaddd ymm0, ymm1, ymm0 LONG $0xc0feedc5 // vpaddd ymm0, ymm2, ymm0 LONG $0xc0fee5c5 // vpaddd ymm0, ymm3, ymm0 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1fef9c5 // vpaddd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fef9c5 // vpaddd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0xc1fef9c5 // vpaddd xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB14_8 LBB14_7: WORD $0x0403; BYTE $0x8f // add eax, dword ptr [rdi + 4*rcx] WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB14_7 LBB14_8: WORD $0x0689 // mov dword ptr [rsi], eax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint32_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0x078b // mov eax, dword ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB15_7 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x20f88349 // cmp r8, 32 JAE LBB15_3 WORD $0xc931 // xor ecx, ecx JMP LBB15_6 LBB15_3: WORD $0xe283; BYTE $0x1f // and edx, 31 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG 
$0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB15_4: LONG $0x3b7de2c4; WORD $0x8704 // vpminud ymm0, ymm0, ymmword ptr [rdi + 4*rax] LONG $0x3b75e2c4; WORD $0x874c; BYTE $0x20 // vpminud ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32] LONG $0x3b6de2c4; WORD $0x8754; BYTE $0x40 // vpminud ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64] LONG $0x3b65e2c4; WORD $0x875c; BYTE $0x60 // vpminud ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96] LONG $0x20c08348 // add rax, 32 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB15_4 LONG $0x3b7de2c4; BYTE $0xc1 // vpminud ymm0, ymm0, ymm1 LONG $0x3b7de2c4; BYTE $0xc2 // vpminud ymm0, ymm0, ymm2 LONG $0x3b7de2c4; BYTE $0xc3 // vpminud ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0x3b79e2c4; BYTE $0xc1 // vpminud xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB15_7 LBB15_6: WORD $0x148b; BYTE $0x8f // mov edx, dword ptr [rdi + 4*rcx] WORD $0xc239 // cmp edx, eax WORD $0x420f; BYTE $0xc2 // cmovb eax, edx WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB15_6 LBB15_7: WORD $0x0689 // mov dword ptr [rsi], eax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint32_max(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0x078b // mov eax, dword ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB16_7 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x20f88349 // cmp r8, 32 JAE LBB16_3 WORD $0xc931 // xor ecx, ecx JMP LBB16_6 LBB16_3: WORD $0xe283; BYTE $0x1f // and edx, 31 WORD 
$0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB16_4: LONG $0x3f7de2c4; WORD $0x8704 // vpmaxud ymm0, ymm0, ymmword ptr [rdi + 4*rax] LONG $0x3f75e2c4; WORD $0x874c; BYTE $0x20 // vpmaxud ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32] LONG $0x3f6de2c4; WORD $0x8754; BYTE $0x40 // vpmaxud ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64] LONG $0x3f65e2c4; WORD $0x875c; BYTE $0x60 // vpmaxud ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96] LONG $0x20c08348 // add rax, 32 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB16_4 LONG $0x3f7de2c4; BYTE $0xc1 // vpmaxud ymm0, ymm0, ymm1 LONG $0x3f7de2c4; BYTE $0xc2 // vpmaxud ymm0, ymm0, ymm2 LONG $0x3f7de2c4; BYTE $0xc3 // vpmaxud ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0x3f79e2c4; BYTE $0xc1 // vpmaxud xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0x3f79e2c4; BYTE $0xc1 // vpmaxud xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0x3f79e2c4; BYTE $0xc1 // vpmaxud xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB16_7 LBB16_6: WORD $0x148b; BYTE $0x8f // mov edx, dword ptr [rdi + 4*rcx] WORD $0xc239 // cmp edx, eax WORD $0x470f; BYTE $0xc2 // cmova eax, edx WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB16_6 LBB16_7: WORD $0x0689 // mov dword ptr [rsi], eax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint32_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD 
$0xc985 // test ecx, ecx JLE LBB17_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB17_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB17_8 LBB17_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB17_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB17_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB17_6: LONG $0x6f7ea1c4; WORD $0x9604 // vmovdqu ymm0, ymmword ptr [rsi + 4*r10] LONG $0x6f7ea1c4; WORD $0x964c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0x6f7ea1c4; WORD $0x9654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0x6f7ea1c4; WORD $0x965c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0xfe7da1c4; WORD $0x9704 // vpaddd ymm0, ymm0, ymmword ptr [rdi + 4*r10] LONG $0xfe75a1c4; WORD $0x974c; BYTE $0x20 // vpaddd ymm1, ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0xfe6da1c4; WORD $0x9754; BYTE $0x40 // vpaddd ymm2, ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0xfe65a1c4; WORD $0x975c; BYTE $0x60 // vpaddd ymm3, ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0x7f7ea1c4; WORD $0x9204 // vmovdqu ymmword ptr [rdx + 4*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x924c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x9254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x925c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB17_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB17_12 LBB17_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // 
mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB17_10 LBB17_9: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x9f040342 // add eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB17_9 LBB17_10: LONG $0x03f98349 // cmp r9, 3 JB LBB17_12 LBB17_11: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x9f040342 // add eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax LONG $0x9e448b42; BYTE $0x04 // mov eax, dword ptr [rsi + 4*r11 + 4] LONG $0x9f440342; BYTE $0x04 // add eax, dword ptr [rdi + 4*r11 + 4] LONG $0x9a448942; BYTE $0x04 // mov dword ptr [rdx + 4*r11 + 4], eax LONG $0x9e448b42; BYTE $0x08 // mov eax, dword ptr [rsi + 4*r11 + 8] LONG $0x9f440342; BYTE $0x08 // add eax, dword ptr [rdi + 4*r11 + 8] LONG $0x9a448942; BYTE $0x08 // mov dword ptr [rdx + 4*r11 + 8], eax LONG $0x9e448b42; BYTE $0x0c // mov eax, dword ptr [rsi + 4*r11 + 12] LONG $0x9f440342; BYTE $0x0c // add eax, dword ptr [rdi + 4*r11 + 12] LONG $0x9a448942; BYTE $0x0c // mov dword ptr [rdx + 4*r11 + 12], eax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB17_11 LBB17_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint32_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB18_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB18_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB18_8 LBB18_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; 
BYTE $0x00 // cmp r9, 128 JB LBB18_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB18_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB18_6: LONG $0x6f7ea1c4; WORD $0x9704 // vmovdqu ymm0, ymmword ptr [rdi + 4*r10] LONG $0x6f7ea1c4; WORD $0x974c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0x6f7ea1c4; WORD $0x9754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0x6f7ea1c4; WORD $0x975c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0xfa7da1c4; WORD $0x9604 // vpsubd ymm0, ymm0, ymmword ptr [rsi + 4*r10] LONG $0xfa75a1c4; WORD $0x964c; BYTE $0x20 // vpsubd ymm1, ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0xfa6da1c4; WORD $0x9654; BYTE $0x40 // vpsubd ymm2, ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0xfa65a1c4; WORD $0x965c; BYTE $0x60 // vpsubd ymm3, ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0x7f7ea1c4; WORD $0x9204 // vmovdqu ymmword ptr [rdx + 4*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x924c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x9254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x925c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB18_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB18_12 LBB18_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB18_10 LBB18_9: LONG $0x9f048b42 // mov eax, dword ptr [rdi + 4*r11] LONG $0x9e042b42 // sub eax, dword ptr [rsi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax WORD $0xff49; BYTE $0xc3 // inc r11 WORD 
$0xff48; BYTE $0xc9 // dec rcx JNE LBB18_9 LBB18_10: LONG $0x03f98349 // cmp r9, 3 JB LBB18_12 LBB18_11: LONG $0x9f048b42 // mov eax, dword ptr [rdi + 4*r11] LONG $0x9e042b42 // sub eax, dword ptr [rsi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax LONG $0x9f448b42; BYTE $0x04 // mov eax, dword ptr [rdi + 4*r11 + 4] LONG $0x9e442b42; BYTE $0x04 // sub eax, dword ptr [rsi + 4*r11 + 4] LONG $0x9a448942; BYTE $0x04 // mov dword ptr [rdx + 4*r11 + 4], eax LONG $0x9f448b42; BYTE $0x08 // mov eax, dword ptr [rdi + 4*r11 + 8] LONG $0x9e442b42; BYTE $0x08 // sub eax, dword ptr [rsi + 4*r11 + 8] LONG $0x9a448942; BYTE $0x08 // mov dword ptr [rdx + 4*r11 + 8], eax LONG $0x9f448b42; BYTE $0x0c // mov eax, dword ptr [rdi + 4*r11 + 12] LONG $0x9e442b42; BYTE $0x0c // sub eax, dword ptr [rsi + 4*r11 + 12] LONG $0x9a448942; BYTE $0x0c // mov dword ptr [rdx + 4*r11 + 12], eax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB18_11 LBB18_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint32_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB19_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB19_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB19_8 LBB19_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB19_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB19_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD 
$0x3145; BYTE $0xd2 // xor r10d, r10d LBB19_6: LONG $0x6f7ea1c4; WORD $0x9604 // vmovdqu ymm0, ymmword ptr [rsi + 4*r10] LONG $0x6f7ea1c4; WORD $0x964c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0x6f7ea1c4; WORD $0x9654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0x6f7ea1c4; WORD $0x965c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0x407da2c4; WORD $0x9704 // vpmulld ymm0, ymm0, ymmword ptr [rdi + 4*r10] LONG $0x4075a2c4; WORD $0x974c; BYTE $0x20 // vpmulld ymm1, ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0x406da2c4; WORD $0x9754; BYTE $0x40 // vpmulld ymm2, ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0x4065a2c4; WORD $0x975c; BYTE $0x60 // vpmulld ymm3, ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0x7f7ea1c4; WORD $0x9204 // vmovdqu ymmword ptr [rdx + 4*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x924c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x9254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x925c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB19_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB19_12 LBB19_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB19_10 LBB19_9: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x04af0f42; BYTE $0x9f // imul eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB19_9 LBB19_10: LONG $0x03f98349 // cmp r9, 3 JB LBB19_12 LBB19_11: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x04af0f42; BYTE $0x9f // imul eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax LONG $0x9e448b42; BYTE $0x04 // mov eax, 
dword ptr [rsi + 4*r11 + 4] LONG $0x44af0f42; WORD $0x049f // imul eax, dword ptr [rdi + 4*r11 + 4] LONG $0x9a448942; BYTE $0x04 // mov dword ptr [rdx + 4*r11 + 4], eax LONG $0x9e448b42; BYTE $0x08 // mov eax, dword ptr [rsi + 4*r11 + 8] LONG $0x44af0f42; WORD $0x089f // imul eax, dword ptr [rdi + 4*r11 + 8] LONG $0x9a448942; BYTE $0x08 // mov dword ptr [rdx + 4*r11 + 8], eax LONG $0x9e448b42; BYTE $0x0c // mov eax, dword ptr [rsi + 4*r11 + 12] LONG $0x44af0f42; WORD $0x0c9f // imul eax, dword ptr [rdi + 4*r11 + 12] LONG $0x9a448942; BYTE $0x0c // mov dword ptr [rdx + 4*r11 + 12], eax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB19_11 LBB19_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint32_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5741 // push r15 WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB20_12 WORD $0x8949; BYTE $0xd0 // mov r8, rdx WORD $0x8941; BYTE $0xcb // mov r11d, ecx LONG $0x08fb8349 // cmp r11, 8 JAE LBB20_3 WORD $0x3145; BYTE $0xff // xor r15d, r15d JMP LBB20_8 LBB20_3: WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xff // xor r15d, r15d LONG $0x20f88348 // cmp rax, 32 JB LBB20_8 WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB20_8 WORD $0x8941; BYTE $0xce // mov r14d, ecx LONG $0x07e68341 // and r14d, 7 WORD $0x894d; BYTE $0xdf // mov r15, r11 WORD $0x294d; BYTE $0xf7 // sub r15, r14 WORD $0xdb31 // xor ebx, ebx LBB20_6: LONG $0x9f0c8b44 // mov r9d, dword ptr [rdi + 4*rbx] LONG $0x049f448b // mov eax, dword ptr [rdi + 4*rbx + 4] WORD $0xd231 // xor edx, edx LONG $0x049e74f7 // div dword ptr [rsi + 4*rbx + 4] 
WORD $0x8941; BYTE $0xc2 // mov r10d, eax WORD $0x8944; BYTE $0xc8 // mov eax, r9d WORD $0xd231 // xor edx, edx WORD $0x34f7; BYTE $0x9e // div dword ptr [rsi + 4*rbx] LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x2279c3c4; WORD $0x01c2 // vpinsrd xmm0, xmm0, r10d, 1 LONG $0x089f448b // mov eax, dword ptr [rdi + 4*rbx + 8] WORD $0xd231 // xor edx, edx LONG $0x089e74f7 // div dword ptr [rsi + 4*rbx + 8] LONG $0x2279e3c4; WORD $0x02c0 // vpinsrd xmm0, xmm0, eax, 2 LONG $0x0c9f448b // mov eax, dword ptr [rdi + 4*rbx + 12] WORD $0xd231 // xor edx, edx LONG $0x0c9e74f7 // div dword ptr [rsi + 4*rbx + 12] LONG $0x2279e3c4; WORD $0x03c0 // vpinsrd xmm0, xmm0, eax, 3 LONG $0x149f448b // mov eax, dword ptr [rdi + 4*rbx + 20] WORD $0xd231 // xor edx, edx LONG $0x149e74f7 // div dword ptr [rsi + 4*rbx + 20] WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0x109f448b // mov eax, dword ptr [rdi + 4*rbx + 16] WORD $0xd231 // xor edx, edx LONG $0x109e74f7 // div dword ptr [rsi + 4*rbx + 16] LONG $0xc86ef9c5 // vmovd xmm1, eax LONG $0x2271c3c4; WORD $0x01c9 // vpinsrd xmm1, xmm1, r9d, 1 LONG $0x189f448b // mov eax, dword ptr [rdi + 4*rbx + 24] WORD $0xd231 // xor edx, edx LONG $0x189e74f7 // div dword ptr [rsi + 4*rbx + 24] LONG $0x2271e3c4; WORD $0x02c8 // vpinsrd xmm1, xmm1, eax, 2 LONG $0x1c9f448b // mov eax, dword ptr [rdi + 4*rbx + 28] WORD $0xd231 // xor edx, edx LONG $0x1c9e74f7 // div dword ptr [rsi + 4*rbx + 28] LONG $0x2271e3c4; WORD $0x03c8 // vpinsrd xmm1, xmm1, eax, 3 LONG $0x7f7ac1c4; WORD $0x984c; BYTE $0x10 // vmovdqu xmmword ptr [r8 + 4*rbx + 16], xmm1 LONG $0x7f7ac1c4; WORD $0x9804 // vmovdqu xmmword ptr [r8 + 4*rbx], xmm0 LONG $0x08c38348 // add rbx, 8 WORD $0x3949; BYTE $0xdf // cmp r15, rbx JNE LBB20_6 WORD $0x854d; BYTE $0xf6 // test r14, r14 JE LBB20_12 LBB20_8: WORD $0x2944; BYTE $0xf9 // sub ecx, r15d LONG $0x015f8d49 // lea rbx, [r15 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB20_10 LONG $0xbf048b42 // mov eax, dword ptr [rdi + 4*r15] WORD $0xd231 // 
xor edx, edx LONG $0xbe34f742 // div dword ptr [rsi + 4*r15] LONG $0xb8048943 // mov dword ptr [r8 + 4*r15], eax WORD $0x8949; BYTE $0xdf // mov r15, rbx LBB20_10: WORD $0x3949; BYTE $0xdb // cmp r11, rbx JE LBB20_12 LBB20_11: LONG $0xbf048b42 // mov eax, dword ptr [rdi + 4*r15] WORD $0xd231 // xor edx, edx LONG $0xbe34f742 // div dword ptr [rsi + 4*r15] LONG $0xb8048943 // mov dword ptr [r8 + 4*r15], eax LONG $0xbf448b42; BYTE $0x04 // mov eax, dword ptr [rdi + 4*r15 + 4] WORD $0xd231 // xor edx, edx LONG $0xbe74f742; BYTE $0x04 // div dword ptr [rsi + 4*r15 + 4] LONG $0xb8448943; BYTE $0x04 // mov dword ptr [r8 + 4*r15 + 4], eax LONG $0x02c78349 // add r15, 2 WORD $0x394d; BYTE $0xfb // cmp r11, r15 JNE LBB20_11 LBB20_12: LONG $0xe8658d48 // lea rsp, [rbp - 24] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 WORD $0x5f41 // pop r15 BYTE $0x5d // pop rbp BYTE $0xc3 // ret TEXT ·_uint64_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB21_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB21_4 WORD $0xc931 // xor ecx, ecx WORD $0xc031 // xor eax, eax JMP LBB21_7 LBB21_1: WORD $0xc031 // xor eax, eax JMP LBB21_8 LBB21_4: WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 LBB21_5: LONG $0x04d4fdc5; BYTE $0xc7 // vpaddq ymm0, ymm0, ymmword ptr [rdi + 8*rax] LONG $0x4cd4f5c5; WORD $0x20c7 // vpaddq ymm1, ymm1, ymmword ptr [rdi + 8*rax + 32] LONG $0x54d4edc5; WORD $0x40c7 // vpaddq ymm2, ymm2, ymmword ptr [rdi + 8*rax + 64] LONG $0x5cd4e5c5; WORD $0x60c7 // vpaddq ymm3, ymm3, ymmword ptr [rdi + 8*rax + 96] LONG 
$0x10c08348 // add rax, 16
	WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
	JNE LBB21_5
	LONG $0xc0d4f5c5 // vpaddq ymm0, ymm1, ymm0
	LONG $0xc0d4edc5 // vpaddq ymm0, ymm2, ymm0
	LONG $0xc0d4e5c5 // vpaddq ymm0, ymm3, ymm0
	LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
	LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1
	LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238
	LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1
	LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0
	WORD $0x8548; BYTE $0xd2 // test rdx, rdx
	JE LBB21_8

LBB21_7:
	LONG $0xcf040348 // add rax, qword ptr [rdi + 8*rcx]
	WORD $0xff48; BYTE $0xc1 // inc rcx
	WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
	JNE LBB21_7

LBB21_8:
	WORD $0x8948; BYTE $0x06 // mov qword ptr [rsi], rax
	WORD $0x8948; BYTE $0xec // mov rsp, rbp
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret

LCPI22_0:
	// QUAD $0x8000000000000000

// _uint64_min(input, result, size) stores the unsigned minimum of the 64-bit
// values at input into *result.
//
//   input  +0(FP)  -> DI  address of the first element
//   result +8(FP)  -> SI  address the minimum is written to
//   size   +16(FP) -> DX  element count; only the low 32 bits are examined
//
// input[0] is loaded before the size check, so input must be readable even
// when size <= 0; in that case input[0] itself is written to *result.
//
// For size >= 16 the main loop consumes 16 elements per iteration in four
// ymm accumulators. There is no unsigned 64-bit vector compare here, so both
// operands are XORed with the sign-bit mask 0x8000000000000000 and compared
// with the signed vpcmpgtq. Remaining elements are handled by a scalar
// cmovb loop.
//
// The mask is fetched with a RIP-relative vpbroadcastq whose displacement is
// hard-coded to 0x10d bytes. With the encodings below (and the three
// intra-function branches assembled as 2-byte short jumps) that address is
// the first byte after the final ret, so the QUAD emitted there is required:
// without it the load picks up whatever bytes follow the function.
TEXT ·_uint64_min(SB), $0-32
	MOVQ input+0(FP), DI
	MOVQ result+8(FP), SI
	MOVQ size+16(FP), DX
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	LONG $0xf8e48348 // and rsp, -8
	WORD $0x8b48; BYTE $0x07 // mov rax, qword ptr [rdi]
	WORD $0xd285 // test edx, edx
	JLE LBB22_7
	WORD $0x8941; BYTE $0xd0 // mov r8d, edx
	LONG $0x10f88349 // cmp r8, 16
	JAE LBB22_3
	WORD $0xc931 // xor ecx, ecx
	JMP LBB22_6

	// Vector path: rdx = size % 16 (scalar tail length), rcx = size - rdx.
LBB22_3:
	WORD $0xe283; BYTE $0x0f // and edx, 15
	WORD $0x894c; BYTE $0xc1 // mov rcx, r8
	WORD $0x2948; BYTE $0xd1 // sub rcx, rdx
	LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
	LONG $0x597de2c4; BYTE $0xc8 // vpbroadcastq ymm1, xmm0
	// ymm0 = sign-bit mask; the operand lives right after this function's ret.
	QUAD $0x00010d05597de2c4; BYTE $0x00 // vpbroadcastq ymm0, qword ptr [rip + .LCPI22_0]
	WORD $0xc031 // xor eax, eax
	LONG $0xe16ffdc5 // vmovdqa ymm4, ymm1
	LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
	LONG $0xd16ffdc5 // vmovdqa ymm2, ymm1

	// Main loop: accumulators ymm1, ymm4, ymm3, ymm2 each keep a running minimum.
LBB22_4:
	LONG $0x2c6ffec5; BYTE $0xc7 // vmovdqu ymm5, ymmword ptr [rdi + 8*rax]
	LONG $0x746ffec5; WORD $0x20c7 // vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 32]
	LONG $0x7c6ffec5; WORD $0x40c7 // vmovdqu ymm7, ymmword ptr [rdi + 8*rax + 64]
	LONG $0xc0ef75c5 // vpxor ymm8, ymm1, ymm0
	LONG $0xc8ef55c5 // vpxor ymm9, ymm5, ymm0
	LONG $0x373d42c4; BYTE $0xc1 // vpcmpgtq ymm8, ymm8, ymm9
	LONG $0x4b75e3c4; WORD $0x80cd // vblendvpd ymm1, ymm1, ymm5, ymm8
	LONG $0xe8efddc5 // vpxor ymm5, ymm4, ymm0
	LONG $0xc0ef4dc5 // vpxor ymm8, ymm6, ymm0
	LONG $0x3755c2c4; BYTE $0xe8 // vpcmpgtq ymm5, ymm5, ymm8
	LONG $0x4b5de3c4; WORD $0x50e6 // vblendvpd ymm4, ymm4, ymm6, ymm5
	LONG $0xe8efe5c5 // vpxor ymm5, ymm3, ymm0
	LONG $0xf0efc5c5 // vpxor ymm6, ymm7, ymm0
	LONG $0x3755e2c4; BYTE $0xee // vpcmpgtq ymm5, ymm5, ymm6
	LONG $0x4b65e3c4; WORD $0x50df // vblendvpd ymm3, ymm3, ymm7, ymm5
	LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, ymmword ptr [rdi + 8*rax + 96]
	LONG $0xf0efedc5 // vpxor ymm6, ymm2, ymm0
	LONG $0xf8efd5c5 // vpxor ymm7, ymm5, ymm0
	LONG $0x374de2c4; BYTE $0xf7 // vpcmpgtq ymm6, ymm6, ymm7
	LONG $0x4b6de3c4; WORD $0x60d5 // vblendvpd ymm2, ymm2, ymm5, ymm6
	LONG $0x10c08348 // add rax, 16
	WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
	JNE LBB22_4

	// Fold the four accumulators into ymm1, then 256 -> 128 -> 64 bits.
	LONG $0xe8eff5c5 // vpxor ymm5, ymm1, ymm0
	LONG $0xf0efddc5 // vpxor ymm6, ymm4, ymm0
	LONG $0x374de2c4; BYTE $0xed // vpcmpgtq ymm5, ymm6, ymm5
	LONG $0x4b5de3c4; WORD $0x50c9 // vblendvpd ymm1, ymm4, ymm1, ymm5
	LONG $0xe057f5c5 // vxorpd ymm4, ymm1, ymm0
	LONG $0xe8efe5c5 // vpxor ymm5, ymm3, ymm0
	LONG $0x3755e2c4; BYTE $0xe4 // vpcmpgtq ymm4, ymm5, ymm4
	LONG $0x4b65e3c4; WORD $0x40c9 // vblendvpd ymm1, ymm3, ymm1, ymm4
	LONG $0xd857f5c5 // vxorpd ymm3, ymm1, ymm0
	LONG $0xe0efedc5 // vpxor ymm4, ymm2, ymm0
	LONG $0x375de2c4; BYTE $0xdb // vpcmpgtq ymm3, ymm4, ymm3
	LONG $0x4b6de3c4; WORD $0x30c9 // vblendvpd ymm1, ymm2, ymm1, ymm3
	LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1
	LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0
	LONG $0xe057e9c5 // vxorpd xmm4, xmm2, xmm0
	LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3
	LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3
	LONG $0x0479e3c4; WORD $0xeed1 // vpermilps xmm2, xmm1, 238
	LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0
	LONG $0xc057e9c5 // vxorpd xmm0, xmm2, xmm0
	LONG $0x3779e2c4; BYTE $0xc3 // vpcmpgtq xmm0, xmm0, xmm3
	LONG $0x4b69e3c4; WORD $0x00c1 // vblendvpd xmm0, xmm2, xmm1, xmm0
	LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0
	WORD $0x8548; BYTE $0xd2 // test rdx, rdx
	JE LBB22_7

	// Scalar tail: rcx indexes the remaining elements up to r8 (size).
LBB22_6:
	LONG $0xcf148b48 // mov rdx, qword ptr [rdi + 8*rcx]
	WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
	LONG $0xc2420f48 // cmovb rax, rdx
	WORD $0xff48; BYTE $0xc1 // inc rcx
	WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
	JNE LBB22_6

LBB22_7:
	WORD $0x8948; BYTE $0x06 // mov qword ptr [rsi], rax
	WORD $0x8948; BYTE $0xec // mov rsp, rbp
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
	// .LCPI22_0: sign-bit mask read by the RIP-relative vpbroadcastq above
	// (same layout as the constant that follows the ret of _uint64_max).
	QUAD $0x8000000000000000

LCPI23_0:
	// QUAD $0x8000000000000000
TEXT ·_uint64_max(SB), $0-32
	MOVQ input+0(FP), DI
	MOVQ result+8(FP), SI
	MOVQ size+16(FP), DX
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	LONG $0xf8e48348 // and rsp, -8
	WORD $0x8b48; BYTE $0x07 // mov rax, qword ptr [rdi]
	WORD $0xd285 // test edx, edx
	JLE LBB23_7
	WORD $0x8941; BYTE $0xd0 // mov r8d, edx
	LONG $0x10f88349 // cmp r8, 16
	JAE LBB23_3
	WORD $0xc931 // xor ecx, ecx
	JMP LBB23_6

LBB23_3:
	WORD $0xe283; BYTE $0x0f // and edx, 15
	WORD $0x894c; BYTE $0xc1 // mov rcx, r8
	WORD $0x2948; BYTE $0xd1 // sub rcx, rdx
	LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
	LONG $0x597de2c4; BYTE $0xc8 // vpbroadcastq ymm1, xmm0
	QUAD $0x00010d05597de2c4; BYTE $0x00 // vpbroadcastq ymm0, qword ptr [rip + .LCPI23_0]
	WORD $0xc031 // xor eax, eax
	LONG $0xe16ffdc5 // vmovdqa ymm4, ymm1
	LONG $0xd96ffdc5 // vmovdqa ymm3, ymm1
	LONG $0xd16ffdc5 // vmovdqa ymm2, ymm1

LBB23_4:
	LONG $0x2c6ffec5; BYTE $0xc7 // vmovdqu ymm5, ymmword ptr [rdi + 8*rax]
	LONG $0x746ffec5; WORD $0x20c7 // vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 32]
	LONG $0x7c6ffec5; WORD $0x40c7 // vmovdqu ymm7, ymmword ptr [rdi + 8*rax + 64]
	LONG $0xc0ef75c5 // vpxor ymm8, ymm1, ymm0
	LONG $0xc8ef55c5 // vpxor ymm9, ymm5, ymm0
	LONG $0x373542c4; BYTE $0xc0 // vpcmpgtq ymm8, ymm9, ymm8
	LONG $0x4b75e3c4;
WORD $0x80cd // vblendvpd ymm1, ymm1, ymm5, ymm8 LONG $0xe8efddc5 // vpxor ymm5, ymm4, ymm0 LONG $0xc0ef4dc5 // vpxor ymm8, ymm6, ymm0 LONG $0x373de2c4; BYTE $0xed // vpcmpgtq ymm5, ymm8, ymm5 LONG $0x4b5de3c4; WORD $0x50e6 // vblendvpd ymm4, ymm4, ymm6, ymm5 LONG $0xe8efe5c5 // vpxor ymm5, ymm3, ymm0 LONG $0xf0efc5c5 // vpxor ymm6, ymm7, ymm0 LONG $0x374de2c4; BYTE $0xed // vpcmpgtq ymm5, ymm6, ymm5 LONG $0x4b65e3c4; WORD $0x50df // vblendvpd ymm3, ymm3, ymm7, ymm5 LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, ymmword ptr [rdi + 8*rax + 96] LONG $0xf0efedc5 // vpxor ymm6, ymm2, ymm0 LONG $0xf8efd5c5 // vpxor ymm7, ymm5, ymm0 LONG $0x3745e2c4; BYTE $0xf6 // vpcmpgtq ymm6, ymm7, ymm6 LONG $0x4b6de3c4; WORD $0x60d5 // vblendvpd ymm2, ymm2, ymm5, ymm6 LONG $0x10c08348 // add rax, 16 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB23_4 LONG $0xe8efddc5 // vpxor ymm5, ymm4, ymm0 LONG $0xf0eff5c5 // vpxor ymm6, ymm1, ymm0 LONG $0x374de2c4; BYTE $0xed // vpcmpgtq ymm5, ymm6, ymm5 LONG $0x4b5de3c4; WORD $0x50c9 // vblendvpd ymm1, ymm4, ymm1, ymm5 LONG $0xe057f5c5 // vxorpd ymm4, ymm1, ymm0 LONG $0xe8efe5c5 // vpxor ymm5, ymm3, ymm0 LONG $0x375de2c4; BYTE $0xe5 // vpcmpgtq ymm4, ymm4, ymm5 LONG $0x4b65e3c4; WORD $0x40c9 // vblendvpd ymm1, ymm3, ymm1, ymm4 LONG $0xd857f5c5 // vxorpd ymm3, ymm1, ymm0 LONG $0xe0efedc5 // vpxor ymm4, ymm2, ymm0 LONG $0x3765e2c4; BYTE $0xdc // vpcmpgtq ymm3, ymm3, ymm4 LONG $0x4b6de3c4; WORD $0x30c9 // vblendvpd ymm1, ymm2, ymm1, ymm3 LONG $0x197de3c4; WORD $0x01ca // vextractf128 xmm2, ymm1, 1 LONG $0xd857e9c5 // vxorpd xmm3, xmm2, xmm0 LONG $0xe057f1c5 // vxorpd xmm4, xmm1, xmm0 LONG $0x3759e2c4; BYTE $0xdb // vpcmpgtq xmm3, xmm4, xmm3 LONG $0x4b69e3c4; WORD $0x30c9 // vblendvpd xmm1, xmm2, xmm1, xmm3 LONG $0x0479e3c4; WORD $0xeed1 // vpermilps xmm2, xmm1, 238 LONG $0xd857f1c5 // vxorpd xmm3, xmm1, xmm0 LONG $0xc057e9c5 // vxorpd xmm0, xmm2, xmm0 LONG $0x3761e2c4; BYTE $0xc0 // vpcmpgtq xmm0, xmm3, xmm0 LONG $0x4b69e3c4; WORD $0x00c1 // 
vblendvpd xmm0, xmm2, xmm1, xmm0 LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB23_7 LBB23_6: LONG $0xcf148b48 // mov rdx, qword ptr [rdi + 8*rcx] WORD $0x3948; BYTE $0xc2 // cmp rdx, rax LONG $0xc2470f48 // cmova rax, rdx WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB23_6 LBB23_7: WORD $0x8948; BYTE $0x06 // mov qword ptr [rsi], rax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret QUAD $0x8000000000000000 TEXT ·_uint64_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB24_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB24_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB24_8 LBB24_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB24_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB24_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB24_6: LONG $0x6f7ea1c4; WORD $0xd604 // vmovdqu ymm0, ymmword ptr [rsi + 8*r10] LONG $0x6f7ea1c4; WORD $0xd64c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd65c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 8*r10 + 96] LONG $0xd47da1c4; WORD $0xd704 // vpaddq ymm0, ymm0, ymmword ptr [rdi + 8*r10] LONG $0xd475a1c4; WORD $0xd74c; BYTE $0x20 // vpaddq ymm1, ymm1, 
ymmword ptr [rdi + 8*r10 + 32] LONG $0xd46da1c4; WORD $0xd754; BYTE $0x40 // vpaddq ymm2, ymm2, ymmword ptr [rdi + 8*r10 + 64] LONG $0xd465a1c4; WORD $0xd75c; BYTE $0x60 // vpaddq ymm3, ymm3, ymmword ptr [rdi + 8*r10 + 96] LONG $0x7f7ea1c4; WORD $0xd204 // vmovdqu ymmword ptr [rdx + 8*r10], ymm0 LONG $0x7f7ea1c4; WORD $0xd24c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 8*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0xd254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 8*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0xd25c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 8*r10 + 96], ymm3 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB24_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB24_12 LBB24_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB24_10 LBB24_9: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0xdf04034a // add rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB24_9 LBB24_10: LONG $0x03f98349 // cmp r9, 3 JB LBB24_12 LBB24_11: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0xdf04034a // add rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax LONG $0xde448b4a; BYTE $0x08 // mov rax, qword ptr [rsi + 8*r11 + 8] LONG $0xdf44034a; BYTE $0x08 // add rax, qword ptr [rdi + 8*r11 + 8] LONG $0xda44894a; BYTE $0x08 // mov qword ptr [rdx + 8*r11 + 8], rax LONG $0xde448b4a; BYTE $0x10 // mov rax, qword ptr [rsi + 8*r11 + 16] LONG $0xdf44034a; BYTE $0x10 // add rax, qword ptr [rdi + 8*r11 + 16] LONG $0xda44894a; BYTE $0x10 // mov qword ptr [rdx + 8*r11 + 16], rax LONG $0xde448b4a; BYTE $0x18 // mov rax, qword ptr [rsi + 8*r11 + 24] LONG $0xdf44034a; BYTE $0x18 // add rax, qword ptr [rdi + 8*r11 + 24] LONG $0xda44894a; BYTE $0x18 // mov qword ptr [rdx + 
8*r11 + 24], rax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB24_11 LBB24_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint64_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB25_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB25_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB25_8 LBB25_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB25_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB25_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB25_6: LONG $0x6f7ea1c4; WORD $0xd704 // vmovdqu ymm0, ymmword ptr [rdi + 8*r10] LONG $0x6f7ea1c4; WORD $0xd74c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd75c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + 8*r10 + 96] LONG $0xfb7da1c4; WORD $0xd604 // vpsubq ymm0, ymm0, ymmword ptr [rsi + 8*r10] LONG $0xfb75a1c4; WORD $0xd64c; BYTE $0x20 // vpsubq ymm1, ymm1, ymmword ptr [rsi + 8*r10 + 32] LONG $0xfb6da1c4; WORD $0xd654; BYTE $0x40 // vpsubq ymm2, ymm2, ymmword ptr [rsi + 8*r10 + 64] LONG $0xfb65a1c4; WORD $0xd65c; BYTE $0x60 // vpsubq ymm3, ymm3, ymmword ptr [rsi + 8*r10 + 96] LONG $0x7f7ea1c4; WORD $0xd204 // vmovdqu ymmword ptr [rdx + 8*r10], ymm0 LONG $0x7f7ea1c4; WORD $0xd24c; BYTE 
$0x20 // vmovdqu ymmword ptr [rdx + 8*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0xd254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 8*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0xd25c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 8*r10 + 96], ymm3 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB25_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB25_12 LBB25_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB25_10 LBB25_9: LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] LONG $0xde042b4a // sub rax, qword ptr [rsi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB25_9 LBB25_10: LONG $0x03f98349 // cmp r9, 3 JB LBB25_12 LBB25_11: LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] LONG $0xde042b4a // sub rax, qword ptr [rsi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax LONG $0xdf448b4a; BYTE $0x08 // mov rax, qword ptr [rdi + 8*r11 + 8] LONG $0xde442b4a; BYTE $0x08 // sub rax, qword ptr [rsi + 8*r11 + 8] LONG $0xda44894a; BYTE $0x08 // mov qword ptr [rdx + 8*r11 + 8], rax LONG $0xdf448b4a; BYTE $0x10 // mov rax, qword ptr [rdi + 8*r11 + 16] LONG $0xde442b4a; BYTE $0x10 // sub rax, qword ptr [rsi + 8*r11 + 16] LONG $0xda44894a; BYTE $0x10 // mov qword ptr [rdx + 8*r11 + 16], rax LONG $0xdf448b4a; BYTE $0x18 // mov rax, qword ptr [rdi + 8*r11 + 24] LONG $0xde442b4a; BYTE $0x18 // sub rax, qword ptr [rsi + 8*r11 + 24] LONG $0xda44894a; BYTE $0x18 // mov qword ptr [rdx + 8*r11 + 24], rax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB25_11 LBB25_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint64_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ 
size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB26_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB26_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB26_8 LBB26_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB26_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB26_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB26_6: LONG $0x6f7ea1c4; WORD $0xd70c // vmovdqu ymm1, ymmword ptr [rdi + 8*r10] LONG $0x6f7ea1c4; WORD $0xd754; BYTE $0x20 // vmovdqu ymm2, ymmword ptr [rdi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd75c; BYTE $0x40 // vmovdqu ymm3, ymmword ptr [rdi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd744; BYTE $0x60 // vmovdqu ymm0, ymmword ptr [rdi + 8*r10 + 96] LONG $0x6f7ea1c4; WORD $0xd624 // vmovdqu ymm4, ymmword ptr [rsi + 8*r10] LONG $0x6f7ea1c4; WORD $0xd66c; BYTE $0x20 // vmovdqu ymm5, ymmword ptr [rsi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd674; BYTE $0x40 // vmovdqu ymm6, ymmword ptr [rsi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd67c; BYTE $0x60 // vmovdqu ymm7, ymmword ptr [rsi + 8*r10 + 96] LONG $0xd473bdc5; BYTE $0x20 // vpsrlq ymm8, ymm4, 32 LONG $0xc1f43dc5 // vpmuludq ymm8, ymm8, ymm1 LONG $0xd173b5c5; BYTE $0x20 // vpsrlq ymm9, ymm1, 32 LONG $0xccf435c5 // vpmuludq ymm9, ymm9, ymm4 LONG $0xd43541c4; BYTE $0xc0 // vpaddq ymm8, ymm9, ymm8 LONG $0x733dc1c4; WORD $0x20f0 // vpsllq ymm8, ymm8, 32 LONG $0xc9f4ddc5 // vpmuludq ymm1, ymm4, ymm1 LONG $0xc9d4bdc5 // vpaddq ymm1, ymm8, ymm1 LONG $0xd573ddc5; BYTE $0x20 // vpsrlq ymm4, ymm5, 32 LONG 
$0xe2f4ddc5 // vpmuludq ymm4, ymm4, ymm2 LONG $0xd273bdc5; BYTE $0x20 // vpsrlq ymm8, ymm2, 32 LONG $0xc5f43dc5 // vpmuludq ymm8, ymm8, ymm5 LONG $0xe4d4bdc5 // vpaddq ymm4, ymm8, ymm4 LONG $0xf473ddc5; BYTE $0x20 // vpsllq ymm4, ymm4, 32 LONG $0xd2f4d5c5 // vpmuludq ymm2, ymm5, ymm2 LONG $0xd4d4edc5 // vpaddq ymm2, ymm2, ymm4 LONG $0xd673ddc5; BYTE $0x20 // vpsrlq ymm4, ymm6, 32 LONG $0xe3f4ddc5 // vpmuludq ymm4, ymm4, ymm3 LONG $0xd373d5c5; BYTE $0x20 // vpsrlq ymm5, ymm3, 32 LONG $0xedf4cdc5 // vpmuludq ymm5, ymm6, ymm5 LONG $0xe4d4d5c5 // vpaddq ymm4, ymm5, ymm4 LONG $0xf473ddc5; BYTE $0x20 // vpsllq ymm4, ymm4, 32 LONG $0xdbf4cdc5 // vpmuludq ymm3, ymm6, ymm3 LONG $0xdcd4e5c5 // vpaddq ymm3, ymm3, ymm4 LONG $0xd773ddc5; BYTE $0x20 // vpsrlq ymm4, ymm7, 32 LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0 LONG $0xd073d5c5; BYTE $0x20 // vpsrlq ymm5, ymm0, 32 LONG $0xedf4c5c5 // vpmuludq ymm5, ymm7, ymm5 LONG $0xe4d4d5c5 // vpaddq ymm4, ymm5, ymm4 LONG $0xf473ddc5; BYTE $0x20 // vpsllq ymm4, ymm4, 32 LONG $0xc0f4c5c5 // vpmuludq ymm0, ymm7, ymm0 LONG $0xc4d4fdc5 // vpaddq ymm0, ymm0, ymm4 LONG $0x7f7ea1c4; WORD $0xd20c // vmovdqu ymmword ptr [rdx + 8*r10], ymm1 LONG $0x7f7ea1c4; WORD $0xd254; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 8*r10 + 32], ymm2 LONG $0x7f7ea1c4; WORD $0xd25c; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 8*r10 + 64], ymm3 LONG $0x7f7ea1c4; WORD $0xd244; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 8*r10 + 96], ymm0 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB26_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB26_12 LBB26_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB26_10 LBB26_9: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0x04af0f4a; BYTE $0xdf // imul rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax WORD $0xff49; BYTE 
$0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB26_9 LBB26_10: LONG $0x03f98349 // cmp r9, 3 JB LBB26_12 LBB26_11: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0x04af0f4a; BYTE $0xdf // imul rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax LONG $0xde448b4a; BYTE $0x08 // mov rax, qword ptr [rsi + 8*r11 + 8] LONG $0x44af0f4a; WORD $0x08df // imul rax, qword ptr [rdi + 8*r11 + 8] LONG $0xda44894a; BYTE $0x08 // mov qword ptr [rdx + 8*r11 + 8], rax LONG $0xde448b4a; BYTE $0x10 // mov rax, qword ptr [rsi + 8*r11 + 16] LONG $0x44af0f4a; WORD $0x10df // imul rax, qword ptr [rdi + 8*r11 + 16] LONG $0xda44894a; BYTE $0x10 // mov qword ptr [rdx + 8*r11 + 16], rax LONG $0xde448b4a; BYTE $0x18 // mov rax, qword ptr [rsi + 8*r11 + 24] LONG $0x44af0f4a; WORD $0x18df // imul rax, qword ptr [rdi + 8*r11 + 24] LONG $0xda44894a; BYTE $0x18 // mov qword ptr [rdx + 8*r11 + 24], rax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB26_11 LBB26_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_uint64_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB27_21 WORD $0x8949; BYTE $0xd0 // mov r8, rdx WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x04fa8349 // cmp r10, 4 JAE LBB27_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB27_8 LBB27_3: WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x20f88348 // cmp rax, 32 JB LBB27_8 WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB27_8 WORD $0x8941; BYTE $0xce // mov r14d, ecx LONG $0x03e68341 // and 
r14d, 3 WORD $0x894d; BYTE $0xd3 // mov r11, r10 WORD $0x294d; BYTE $0xf3 // sub r11, r14 WORD $0xdb31 // xor ebx, ebx LBB27_6: LONG $0xdf0c8b4c // mov r9, qword ptr [rdi + 8*rbx] LONG $0xdf448b48; BYTE $0x08 // mov rax, qword ptr [rdi + 8*rbx + 8] WORD $0xd231 // xor edx, edx LONG $0xde74f748; BYTE $0x08 // div qword ptr [rsi + 8*rbx + 8] LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax WORD $0x894c; BYTE $0xc8 // mov rax, r9 WORD $0xd231 // xor edx, edx LONG $0xde34f748 // div qword ptr [rsi + 8*rbx] LONG $0x6ef9e1c4; BYTE $0xc8 // vmovq xmm1, rax LONG $0xc06cf1c5 // vpunpcklqdq xmm0, xmm1, xmm0 LONG $0xdf448b48; BYTE $0x18 // mov rax, qword ptr [rdi + 8*rbx + 24] WORD $0xd231 // xor edx, edx LONG $0xde74f748; BYTE $0x18 // div qword ptr [rsi + 8*rbx + 24] WORD $0x8949; BYTE $0xc1 // mov r9, rax LONG $0xdf448b48; BYTE $0x10 // mov rax, qword ptr [rdi + 8*rbx + 16] WORD $0xd231 // xor edx, edx LONG $0xde74f748; BYTE $0x10 // div qword ptr [rsi + 8*rbx + 16] LONG $0x6ef9c1c4; BYTE $0xc9 // vmovq xmm1, r9 LONG $0x6ef9e1c4; BYTE $0xd0 // vmovq xmm2, rax LONG $0xc96ce9c5 // vpunpcklqdq xmm1, xmm2, xmm1 LONG $0x7f7ac1c4; WORD $0xd84c; BYTE $0x10 // vmovdqu xmmword ptr [r8 + 8*rbx + 16], xmm1 LONG $0x7f7ac1c4; WORD $0xd804 // vmovdqu xmmword ptr [r8 + 8*rbx], xmm0 LONG $0x04c38348 // add rbx, 4 WORD $0x3949; BYTE $0xdb // cmp r11, rbx JNE LBB27_6 WORD $0x854d; BYTE $0xf6 // test r14, r14 JE LBB27_21 LBB27_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d LONG $0x015b8d49 // lea rbx, [r11 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB27_13 LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] LONG $0xde0c8b4a // mov rcx, qword ptr [rsi + 8*r11] WORD $0x8948; BYTE $0xc2 // mov rdx, rax WORD $0x0948; BYTE $0xca // or rdx, rcx LONG $0x20eac148 // shr rdx, 32 JE LBB27_10 WORD $0xd231 // xor edx, edx WORD $0xf748; BYTE $0xf1 // div rcx JMP LBB27_12 LBB27_10: WORD $0xd231 // xor edx, edx WORD $0xf1f7 // div ecx LBB27_12: LONG $0xd804894b // mov qword ptr [r8 + 8*r11], rax WORD 
$0x8949; BYTE $0xdb // mov r11, rbx LBB27_13: WORD $0x3949; BYTE $0xda // cmp r10, rbx JNE LBB27_14 JMP LBB27_21 LBB27_19: WORD $0xd231 // xor edx, edx WORD $0xf748; BYTE $0xf1 // div rcx LONG $0xd844894b; BYTE $0x08 // mov qword ptr [r8 + 8*r11 + 8], rax LONG $0x02c38349 // add r11, 2 WORD $0x394d; BYTE $0xda // cmp r10, r11 JE LBB27_21 LBB27_14: LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] LONG $0xde0c8b4a // mov rcx, qword ptr [rsi + 8*r11] WORD $0x8948; BYTE $0xc2 // mov rdx, rax WORD $0x0948; BYTE $0xca // or rdx, rcx LONG $0x20eac148 // shr rdx, 32 JE LBB27_15 WORD $0xd231 // xor edx, edx WORD $0xf748; BYTE $0xf1 // div rcx JMP LBB27_17 LBB27_15: WORD $0xd231 // xor edx, edx WORD $0xf1f7 // div ecx LBB27_17: LONG $0xd804894b // mov qword ptr [r8 + 8*r11], rax LONG $0xdf448b4a; BYTE $0x08 // mov rax, qword ptr [rdi + 8*r11 + 8] LONG $0xde4c8b4a; BYTE $0x08 // mov rcx, qword ptr [rsi + 8*r11 + 8] WORD $0x8948; BYTE $0xc2 // mov rdx, rax WORD $0x0948; BYTE $0xca // or rdx, rcx LONG $0x20eac148 // shr rdx, 32 JNE LBB27_19 WORD $0xd231 // xor edx, edx WORD $0xf1f7 // div ecx LONG $0xd844894b; BYTE $0x08 // mov qword ptr [r8 + 8*r11 + 8], rax LONG $0x02c38349 // add r11, 2 WORD $0x394d; BYTE $0xda // cmp r10, r11 JNE LBB27_14 LBB27_21: LONG $0xf0658d48 // lea rsp, [rbp - 16] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 BYTE $0x5d // pop rbp BYTE $0xc3 // ret TEXT ·_int8_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB28_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB28_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d WORD $0xc031 // xor eax, eax JMP LBB28_13 LBB28_1: WORD $0xc031 // xor eax, eax JMP LBB28_14 LBB28_4: LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB28_6 WORD $0xc031 // xor eax, eax WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP 
LBB28_10 LBB28_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 LBB28_7: LONG $0x04fcfdc5; BYTE $0x07 // vpaddb ymm0, ymm0, ymmword ptr [rdi + rax] LONG $0x4cfcf5c5; WORD $0x2007 // vpaddb ymm1, ymm1, ymmword ptr [rdi + rax + 32] LONG $0x54fcedc5; WORD $0x4007 // vpaddb ymm2, ymm2, ymmword ptr [rdi + rax + 64] LONG $0x5cfce5c5; WORD $0x6007 // vpaddb ymm3, ymm3, ymmword ptr [rdi + rax + 96] LONG $0x80e88348 // sub rax, -128 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB28_7 LONG $0xc0fcf5c5 // vpaddb ymm0, ymm1, ymm0 LONG $0xc0fcedc5 // vpaddb ymm0, ymm2, ymm0 LONG $0xc0fce5c5 // vpaddb ymm0, ymm3, ymm0 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1fcf9c5 // vpaddb xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fcf9c5 // vpaddb xmm0, xmm0, xmm1 LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xc1f6f9c5 // vpsadbw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB28_14 LONG $0x10f98341 // cmp r9d, 16 JB LBB28_13 LBB28_10: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx WORD $0xb60f; BYTE $0xc0 // movzx eax, al LONG $0xc06ef9c5 // vmovd xmm0, eax LBB28_11: LONG $0x04fcf9c5; BYTE $0x0f // vpaddb xmm0, xmm0, xmmword ptr [rdi + rcx] LONG $0x10c18348 // add rcx, 16 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB28_11 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fcf9c5 // vpaddb xmm0, xmm0, xmm1 LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xc1f6f9c5 // vpsadbw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD 
$0x8548; BYTE $0xd2 // test rdx, rdx JE LBB28_14 LBB28_13: LONG $0x17040242 // add al, byte ptr [rdi + r10] WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB28_13 LBB28_14: WORD $0x0688 // mov byte ptr [rsi], al WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LCPI29_0: // TEXT ·_int8_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xb60f; BYTE $0x07 // movzx eax, byte ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB29_13 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB29_3 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB29_12 LBB29_3: LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB29_5 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB29_9 LBB29_5: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB29_6: LONG $0x387de2c4; WORD $0x0704 // vpminsb ymm0, ymm0, ymmword ptr [rdi + rax] LONG $0x3875e2c4; WORD $0x074c; BYTE $0x20 // vpminsb ymm1, ymm1, ymmword ptr [rdi + rax + 32] LONG $0x386de2c4; WORD $0x0754; BYTE $0x40 // vpminsb ymm2, ymm2, ymmword ptr [rdi + rax + 64] LONG $0x3865e2c4; WORD $0x075c; BYTE $0x60 // vpminsb ymm3, ymm3, ymmword ptr [rdi + rax + 96] LONG $0x80e88348 // sub rax, -128 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB29_6 LONG $0x387de2c4; BYTE $0xc1 // vpminsb ymm0, ymm0, ymm1 LONG $0x387de2c4; BYTE $0xc2 // vpminsb ymm0, ymm0, ymm2 LONG $0x387de2c4; BYTE $0xc3 // vpminsb ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 
// vextracti128 xmm1, ymm0, 1 LONG $0x3879e2c4; BYTE $0xc1 // vpminsb xmm0, xmm0, xmm1 QUAD $0x0000008305eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI29_0] LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8004 // add al, -128 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB29_13 LONG $0x10f98341 // cmp r9d, 16 JB LBB29_12 LBB29_9: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7879e2c4; BYTE $0xc0 // vpbroadcastb xmm0, xmm0 LBB29_10: LONG $0x3879e2c4; WORD $0x0f04 // vpminsb xmm0, xmm0, xmmword ptr [rdi + rcx] LONG $0x10c18348 // add rcx, 16 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB29_10 QUAD $0x0000003805eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI29_0] LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8004 // add al, -128 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB29_13 LBB29_12: LONG $0x0cb60f42; BYTE $0x17 // movzx ecx, byte ptr [rdi + r10] WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xc138 // cmp cl, al WORD $0x4c0f; BYTE $0xc1 // cmovl eax, ecx WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB29_12 LBB29_13: WORD $0x0688 // mov byte ptr [rsi], al WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LCPI30_0: // BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 BYTE $0x80 TEXT ·_int8_max(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // 
push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xb60f; BYTE $0x07 // movzx eax, byte ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB30_13 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB30_3 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB30_12 LBB30_3: LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB30_5 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB30_9 LBB30_5: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB30_6: LONG $0x3c7de2c4; WORD $0x0704 // vpmaxsb ymm0, ymm0, ymmword ptr [rdi + rax] LONG $0x3c75e2c4; WORD $0x074c; BYTE $0x20 // vpmaxsb ymm1, ymm1, ymmword ptr [rdi + rax + 32] LONG $0x3c6de2c4; WORD $0x0754; BYTE $0x40 // vpmaxsb ymm2, ymm2, ymmword ptr [rdi + rax + 64] LONG $0x3c65e2c4; WORD $0x075c; BYTE $0x60 // vpmaxsb ymm3, ymm3, ymmword ptr [rdi + rax + 96] LONG $0x80e88348 // sub rax, -128 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB30_6 LONG $0x3c7de2c4; BYTE $0xc1 // vpmaxsb ymm0, ymm0, ymm1 LONG $0x3c7de2c4; BYTE $0xc2 // vpmaxsb ymm0, ymm0, ymm2 LONG $0x3c7de2c4; BYTE $0xc3 // vpmaxsb ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0x3c79e2c4; BYTE $0xc1 // vpmaxsb xmm0, xmm0, xmm1 QUAD $0x0000008305eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI30_0] LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x7f34 // xor al, 127 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB30_13 LONG $0x10f98341 // cmp r9d, 16 JB LBB30_12 
LBB30_9: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7879e2c4; BYTE $0xc0 // vpbroadcastb xmm0, xmm0 LBB30_10: LONG $0x3c79e2c4; WORD $0x0f04 // vpmaxsb xmm0, xmm0, xmmword ptr [rdi + rcx] LONG $0x10c18348 // add rcx, 16 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB30_10 QUAD $0x0000003805eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI30_0] LONG $0xd071f1c5; BYTE $0x08 // vpsrlw xmm1, xmm0, 8 LONG $0xc1daf9c5 // vpminub xmm0, xmm0, xmm1 LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x7f34 // xor al, 127 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB30_13 LBB30_12: LONG $0x0cb60f42; BYTE $0x17 // movzx ecx, byte ptr [rdi + r10] WORD $0xb60f; BYTE $0xc0 // movzx eax, al WORD $0xc138 // cmp cl, al WORD $0x4f0f; BYTE $0xc1 // cmovg eax, ecx WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB30_12 LBB30_13: WORD $0x0688 // mov byte ptr [rsi], al WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f BYTE $0x7f TEXT ·_int8_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB31_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB31_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB31_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB31_16 
LBB31_15: LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte ptr [rsi + r11] LONG $0x1f040242 // add al, byte ptr [rdi + r11] LONG $0x1a048842 // mov byte ptr [rdx + r11], al WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB31_15 LBB31_16: LONG $0x03f98349 // cmp r9, 3 JB LBB31_18 LBB31_17: LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte ptr [rsi + r11] LONG $0x1f040242 // add al, byte ptr [rdi + r11] LONG $0x1a048842 // mov byte ptr [rdx + r11], al LONG $0x44b60f42; WORD $0x011e // movzx eax, byte ptr [rsi + r11 + 1] LONG $0x1f440242; BYTE $0x01 // add al, byte ptr [rdi + r11 + 1] LONG $0x1a448842; BYTE $0x01 // mov byte ptr [rdx + r11 + 1], al LONG $0x44b60f42; WORD $0x021e // movzx eax, byte ptr [rsi + r11 + 2] LONG $0x1f440242; BYTE $0x02 // add al, byte ptr [rdi + r11 + 2] LONG $0x1a448842; BYTE $0x02 // mov byte ptr [rdx + r11 + 2], al LONG $0x44b60f42; WORD $0x031e // movzx eax, byte ptr [rsi + r11 + 3] LONG $0x1f440242; BYTE $0x03 // add al, byte ptr [rdi + r11 + 3] LONG $0x1a448842; BYTE $0x03 // mov byte ptr [rdx + r11 + 3], al LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB31_17 LBB31_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB31_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB31_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB31_14 LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB31_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB31_11 LBB31_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB31_8: LONG $0x6f7ea1c4; 
WORD $0x1604 // vmovdqu ymm0, ymmword ptr [rsi + r10] LONG $0x6f7ea1c4; WORD $0x164c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + r10 + 32] LONG $0x6f7ea1c4; WORD $0x1654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + r10 + 64] LONG $0x6f7ea1c4; WORD $0x165c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + r10 + 96] LONG $0xfc7da1c4; WORD $0x1704 // vpaddb ymm0, ymm0, ymmword ptr [rdi + r10] LONG $0xfc75a1c4; WORD $0x174c; BYTE $0x20 // vpaddb ymm1, ymm1, ymmword ptr [rdi + r10 + 32] LONG $0xfc6da1c4; WORD $0x1754; BYTE $0x40 // vpaddb ymm2, ymm2, ymmword ptr [rdi + r10 + 64] LONG $0xfc65a1c4; WORD $0x175c; BYTE $0x60 // vpaddb ymm3, ymm3, ymmword ptr [rdi + r10 + 96] LONG $0x7f7ea1c4; WORD $0x1204 // vmovdqu ymmword ptr [rdx + r10], ymm0 LONG $0x7f7ea1c4; WORD $0x124c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x1254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x125c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + r10 + 96], ymm3 LONG $0x80ea8349 // sub r10, -128 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB31_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB31_18 LONG $0x10f98341 // cmp r9d, 16 JB LBB31_14 LBB31_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x0fe28341 // and r10d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB31_12: LONG $0x6f7aa1c4; WORD $0x0e04 // vmovdqu xmm0, xmmword ptr [rsi + r9] LONG $0xfc79a1c4; WORD $0x0f04 // vpaddb xmm0, xmm0, xmmword ptr [rdi + r9] LONG $0x7f7aa1c4; WORD $0x0a04 // vmovdqu xmmword ptr [rdx + r9], xmm0 LONG $0x10c18349 // add r9, 16 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB31_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB31_14 JMP LBB31_18 TEXT ·_int8_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 
WORD $0xc985 // test ecx, ecx JLE LBB32_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB32_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB32_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB32_16 LBB32_15: LONG $0x04b60f42; BYTE $0x1f // movzx eax, byte ptr [rdi + r11] LONG $0x1e042a42 // sub al, byte ptr [rsi + r11] LONG $0x1a048842 // mov byte ptr [rdx + r11], al WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB32_15 LBB32_16: LONG $0x03f98349 // cmp r9, 3 JB LBB32_18 LBB32_17: LONG $0x04b60f42; BYTE $0x1f // movzx eax, byte ptr [rdi + r11] LONG $0x1e042a42 // sub al, byte ptr [rsi + r11] LONG $0x1a048842 // mov byte ptr [rdx + r11], al LONG $0x44b60f42; WORD $0x011f // movzx eax, byte ptr [rdi + r11 + 1] LONG $0x1e442a42; BYTE $0x01 // sub al, byte ptr [rsi + r11 + 1] LONG $0x1a448842; BYTE $0x01 // mov byte ptr [rdx + r11 + 1], al LONG $0x44b60f42; WORD $0x021f // movzx eax, byte ptr [rdi + r11 + 2] LONG $0x1e442a42; BYTE $0x02 // sub al, byte ptr [rsi + r11 + 2] LONG $0x1a448842; BYTE $0x02 // mov byte ptr [rdx + r11 + 2], al LONG $0x44b60f42; WORD $0x031f // movzx eax, byte ptr [rdi + r11 + 3] LONG $0x1e442a42; BYTE $0x03 // sub al, byte ptr [rsi + r11 + 3] LONG $0x1a448842; BYTE $0x03 // mov byte ptr [rdx + r11 + 3], al LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB32_17 LBB32_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB32_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB32_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp 
rax, 128 JB LBB32_14 LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB32_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB32_11 LBB32_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x7fe18341 // and r9d, 127 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB32_8: LONG $0x6f7ea1c4; WORD $0x1704 // vmovdqu ymm0, ymmword ptr [rdi + r10] LONG $0x6f7ea1c4; WORD $0x174c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + r10 + 32] LONG $0x6f7ea1c4; WORD $0x1754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + r10 + 64] LONG $0x6f7ea1c4; WORD $0x175c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + r10 + 96] LONG $0xf87da1c4; WORD $0x1604 // vpsubb ymm0, ymm0, ymmword ptr [rsi + r10] LONG $0xf875a1c4; WORD $0x164c; BYTE $0x20 // vpsubb ymm1, ymm1, ymmword ptr [rsi + r10 + 32] LONG $0xf86da1c4; WORD $0x1654; BYTE $0x40 // vpsubb ymm2, ymm2, ymmword ptr [rsi + r10 + 64] LONG $0xf865a1c4; WORD $0x165c; BYTE $0x60 // vpsubb ymm3, ymm3, ymmword ptr [rsi + r10 + 96] LONG $0x7f7ea1c4; WORD $0x1204 // vmovdqu ymmword ptr [rdx + r10], ymm0 LONG $0x7f7ea1c4; WORD $0x124c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x1254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x125c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + r10 + 96], ymm3 LONG $0x80ea8349 // sub r10, -128 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB32_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB32_18 LONG $0x10f98341 // cmp r9d, 16 JB LBB32_14 LBB32_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x0fe28341 // and r10d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB32_12: LONG $0x6f7aa1c4; WORD $0x0f04 // vmovdqu xmm0, xmmword ptr [rdi + r9] LONG $0xf879a1c4; WORD $0x0e04 // vpsubb xmm0, xmm0, xmmword ptr [rsi + r9] LONG $0x7f7aa1c4; WORD $0x0a04 // vmovdqu xmmword ptr 
[rdx + r9], xmm0 LONG $0x10c18349 // add r9, 16 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB32_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB32_14 JMP LBB32_18 LCPI33_0: // TEXT ·_int8_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB33_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB33_3 WORD $0x3145; BYTE $0xc9 // xor r9d, r9d LBB33_14: WORD $0x2944; BYTE $0xc9 // sub ecx, r9d WORD $0x894d; BYTE $0xca // mov r10, r9 WORD $0xf749; BYTE $0xd2 // not r10 WORD $0x014d; BYTE $0xc2 // add r10, r8 LONG $0x03e18348 // and rcx, 3 JE LBB33_16 LBB33_15: LONG $0x04b60f42; BYTE $0x0e // movzx eax, byte ptr [rsi + r9] LONG $0x0f24f642 // mul byte ptr [rdi + r9] LONG $0x0a048842 // mov byte ptr [rdx + r9], al WORD $0xff49; BYTE $0xc1 // inc r9 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB33_15 LBB33_16: LONG $0x03fa8349 // cmp r10, 3 JB LBB33_18 LBB33_17: LONG $0x04b60f42; BYTE $0x0e // movzx eax, byte ptr [rsi + r9] LONG $0x0f24f642 // mul byte ptr [rdi + r9] LONG $0x0a048842 // mov byte ptr [rdx + r9], al LONG $0x44b60f42; WORD $0x010e // movzx eax, byte ptr [rsi + r9 + 1] LONG $0x0f64f642; BYTE $0x01 // mul byte ptr [rdi + r9 + 1] LONG $0x0a448842; BYTE $0x01 // mov byte ptr [rdx + r9 + 1], al LONG $0x44b60f42; WORD $0x020e // movzx eax, byte ptr [rsi + r9 + 2] LONG $0x0f64f642; BYTE $0x02 // mul byte ptr [rdi + r9 + 2] LONG $0x0a448842; BYTE $0x02 // mov byte ptr [rdx + r9 + 2], al LONG $0x44b60f42; WORD $0x030e // movzx eax, byte ptr [rsi + r9 + 3] LONG $0x0f64f642; BYTE $0x03 // mul byte ptr [rdi + r9 + 3] LONG $0x0a448842; BYTE $0x03 // mov byte ptr [rdx + r9 + 3], al LONG $0x04c18349 // add r9, 4 WORD $0x394d; BYTE $0xc8 // cmp r8, r9 JNE LBB33_17 LBB33_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // 
vzeroupper BYTE $0xc3 // ret LBB33_3: WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xc9 // xor r9d, r9d LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB33_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB33_14 LONG $0x80f88141; WORD $0x0000; BYTE $0x00 // cmp r8d, 128 JAE LBB33_7 WORD $0x3145; BYTE $0xc9 // xor r9d, r9d JMP LBB33_11 LBB33_7: WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x7fe28341 // and r10d, 127 WORD $0x894d; BYTE $0xc1 // mov r9, r8 WORD $0x294d; BYTE $0xd1 // sub r9, r10 WORD $0xc031 // xor eax, eax QUAD $0x0000014b056ffec5 // vmovdqu ymm0, ymmword ptr [rip + .LCPI33_0] LBB33_8: LONG $0x1c6ffec5; BYTE $0x07 // vmovdqu ymm3, ymmword ptr [rdi + rax] LONG $0x646ffec5; WORD $0x2007 // vmovdqu ymm4, ymmword ptr [rdi + rax + 32] LONG $0x6c6ffec5; WORD $0x4007 // vmovdqu ymm5, ymmword ptr [rdi + rax + 64] LONG $0x4c6ffec5; WORD $0x6007 // vmovdqu ymm1, ymmword ptr [rdi + rax + 96] LONG $0x346ffec5; BYTE $0x06 // vmovdqu ymm6, ymmword ptr [rsi + rax] LONG $0x7c6ffec5; WORD $0x2006 // vmovdqu ymm7, ymmword ptr [rsi + rax + 32] LONG $0x446f7ec5; WORD $0x4006 // vmovdqu ymm8, ymmword ptr [rsi + rax + 64] LONG $0x546ffec5; WORD $0x6006 // vmovdqu ymm2, ymmword ptr [rsi + rax + 96] LONG $0xcb6865c5 // vpunpckhbw ymm9, ymm3, ymm3 LONG $0xd6684dc5 // vpunpckhbw ymm10, ymm6, ymm6 LONG $0xd52d41c4; BYTE $0xc9 // vpmullw ymm9, ymm10, ymm9 LONG $0xc8db35c5 // vpand ymm9, ymm9, ymm0 LONG $0xdb60e5c5 // vpunpcklbw ymm3, ymm3, ymm3 LONG $0xf660cdc5 // vpunpcklbw ymm6, ymm6, ymm6 LONG $0xdbd5cdc5 // vpmullw ymm3, ymm6, ymm3 LONG $0xd8dbe5c5 // vpand ymm3, ymm3, ymm0 LONG $0x6765c1c4; BYTE $0xd9 // vpackuswb ymm3, ymm3, ymm9 LONG $0xf468ddc5 // vpunpckhbw ymm6, ymm4, ymm4 LONG $0xcf6845c5 // vpunpckhbw ymm9, ymm7, ymm7 LONG $0xf6d5b5c5 // vpmullw ymm6, ymm9, ymm6 LONG $0xf0dbcdc5 // vpand ymm6, ymm6, ymm0 LONG 
$0xe460ddc5 // vpunpcklbw ymm4, ymm4, ymm4 LONG $0xff60c5c5 // vpunpcklbw ymm7, ymm7, ymm7 LONG $0xe4d5c5c5 // vpmullw ymm4, ymm7, ymm4 LONG $0xe0dbddc5 // vpand ymm4, ymm4, ymm0 LONG $0xe667ddc5 // vpackuswb ymm4, ymm4, ymm6 LONG $0xf568d5c5 // vpunpckhbw ymm6, ymm5, ymm5 LONG $0x683dc1c4; BYTE $0xf8 // vpunpckhbw ymm7, ymm8, ymm8 LONG $0xf6d5c5c5 // vpmullw ymm6, ymm7, ymm6 LONG $0xf0dbcdc5 // vpand ymm6, ymm6, ymm0 LONG $0xed60d5c5 // vpunpcklbw ymm5, ymm5, ymm5 LONG $0x603dc1c4; BYTE $0xf8 // vpunpcklbw ymm7, ymm8, ymm8 LONG $0xedd5c5c5 // vpmullw ymm5, ymm7, ymm5 LONG $0xe8dbd5c5 // vpand ymm5, ymm5, ymm0 LONG $0xee67d5c5 // vpackuswb ymm5, ymm5, ymm6 LONG $0xf168f5c5 // vpunpckhbw ymm6, ymm1, ymm1 LONG $0xfa68edc5 // vpunpckhbw ymm7, ymm2, ymm2 LONG $0xf6d5c5c5 // vpmullw ymm6, ymm7, ymm6 LONG $0xf0dbcdc5 // vpand ymm6, ymm6, ymm0 LONG $0xc960f5c5 // vpunpcklbw ymm1, ymm1, ymm1 LONG $0xd260edc5 // vpunpcklbw ymm2, ymm2, ymm2 LONG $0xc9d5edc5 // vpmullw ymm1, ymm2, ymm1 LONG $0xc8dbf5c5 // vpand ymm1, ymm1, ymm0 LONG $0xce67f5c5 // vpackuswb ymm1, ymm1, ymm6 LONG $0x1c7ffec5; BYTE $0x02 // vmovdqu ymmword ptr [rdx + rax], ymm3 LONG $0x647ffec5; WORD $0x2002 // vmovdqu ymmword ptr [rdx + rax + 32], ymm4 LONG $0x6c7ffec5; WORD $0x4002 // vmovdqu ymmword ptr [rdx + rax + 64], ymm5 LONG $0x4c7ffec5; WORD $0x6002 // vmovdqu ymmword ptr [rdx + rax + 96], ymm1 LONG $0x80e88348 // sub rax, -128 WORD $0x3949; BYTE $0xc1 // cmp r9, rax JNE LBB33_8 WORD $0x854d; BYTE $0xd2 // test r10, r10 JE LBB33_18 LONG $0x10fa8341 // cmp r10d, 16 JB LBB33_14 LBB33_11: WORD $0x894c; BYTE $0xc8 // mov rax, r9 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x0fe28341 // and r10d, 15 WORD $0x894d; BYTE $0xc1 // mov r9, r8 WORD $0x294d; BYTE $0xd1 // sub r9, r10 QUAD $0x0000003a056ffec5 // vmovdqu ymm0, ymmword ptr [rip + .LCPI33_0] LBB33_12: LONG $0x307de2c4; WORD $0x070c // vpmovzxbw ymm1, xmmword ptr [rdi + rax] LONG $0x307de2c4; WORD $0x0614 // vpmovzxbw ymm2, xmmword ptr [rsi + rax] 
LONG $0xc9d5edc5 // vpmullw ymm1, ymm2, ymm1 LONG $0xc8dbf5c5 // vpand ymm1, ymm1, ymm0 LONG $0x397de3c4; WORD $0x01ca // vextracti128 xmm2, ymm1, 1 LONG $0xca67f1c5 // vpackuswb xmm1, xmm1, xmm2 LONG $0x0c7ffac5; BYTE $0x02 // vmovdqu xmmword ptr [rdx + rax], xmm1 LONG $0x10c08348 // add rax, 16 WORD $0x3949; BYTE $0xc1 // cmp r9, rax JNE LBB33_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB33_14 JMP LBB33_18 LCPI34_0: // WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff WORD $0x00ff TEXT ·_int8_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5741 // push r15 WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB34_12 WORD $0x8949; BYTE $0xd0 // mov r8, rdx WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x20fa8349 // cmp r10, 32 JAE LBB34_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB34_8 LBB34_3: WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x20f88348 // cmp rax, 32 JB LBB34_8 WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB34_8 WORD $0x8941; BYTE $0xce // mov r14d, ecx LONG $0x1fe68341 // and r14d, 31 WORD $0x894d; BYTE $0xd3 // mov r11, r10 WORD $0x294d; BYTE $0xf3 // sub r11, r14 WORD $0x3145; BYTE $0xff // xor r15d, r15d QUAD $0x00035305797de2c4; BYTE $0x00 // vpbroadcastw ymm0, word ptr [rip + .LCPI34_0] LBB34_6: LONG $0x207da2c4; WORD $0x3f4c; BYTE $0x10 // vpmovsxbw ymm1, xmmword ptr [rdi + r15 + 16] LONG $0x207da2c4; WORD $0x3f24 // vpmovsxbw ymm4, xmmword ptr [rdi + r15] LONG $0x207da2c4; WORD $0x3e54; BYTE $0x10 // vpmovsxbw ymm2, xmmword ptr [rsi + r15 + 16] 
LONG $0x207da2c4; WORD $0x3e2c // vpmovsxbw ymm5, xmmword ptr [rsi + r15] LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 LONG $0xc3c5f9c5; BYTE $0x01 // vpextrw eax, xmm3, 1 LONG $0x397de3c4; WORD $0x01ee // vextracti128 xmm6, ymm5, 1 LONG $0xdec5f9c5; BYTE $0x01 // vpextrw ebx, xmm6, 1 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xd87ef9c5 // vmovd eax, xmm3 LONG $0xf37ef9c5 // vmovd ebx, xmm6 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf86ef9c5 // vmovd xmm7, eax LONG $0xc441c1c4; WORD $0x01f9 // vpinsrw xmm7, xmm7, r9d, 1 LONG $0xc3c5f9c5; BYTE $0x02 // vpextrw eax, xmm3, 2 LONG $0xdec5f9c5; BYTE $0x02 // vpextrw ebx, xmm6, 2 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x02 // vpinsrw xmm7, xmm7, eax, 2 LONG $0xc3c5f9c5; BYTE $0x03 // vpextrw eax, xmm3, 3 LONG $0xdec5f9c5; BYTE $0x03 // vpextrw ebx, xmm6, 3 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x03 // vpinsrw xmm7, xmm7, eax, 3 LONG $0xc3c5f9c5; BYTE $0x04 // vpextrw eax, xmm3, 4 LONG $0xdec5f9c5; BYTE $0x04 // vpextrw ebx, xmm6, 4 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x04 // vpinsrw xmm7, xmm7, eax, 4 LONG $0xc3c5f9c5; BYTE $0x05 // vpextrw eax, xmm3, 5 LONG $0xdec5f9c5; BYTE $0x05 // vpextrw ebx, xmm6, 5 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x05 // vpinsrw xmm7, xmm7, eax, 5 LONG $0xc3c5f9c5; BYTE $0x06 // vpextrw eax, xmm3, 6 LONG $0xdec5f9c5; BYTE $0x06 // vpextrw ebx, xmm6, 6 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x06 // vpinsrw xmm7, xmm7, eax, 6 LONG $0xc3c5f9c5; BYTE $0x07 // vpextrw eax, xmm3, 7 LONG $0xdec5f9c5; BYTE $0x07 // vpextrw ebx, xmm6, 7 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xd8c4c1c5; BYTE $0x07 // vpinsrw xmm3, xmm7, eax, 7 LONG $0xc4c5f9c5; BYTE $0x01 // vpextrw eax, 
xmm4, 1 LONG $0xddc5f9c5; BYTE $0x01 // vpextrw ebx, xmm5, 1 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xe07ef9c5 // vmovd eax, xmm4 LONG $0xeb7ef9c5 // vmovd ebx, xmm5 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf06ef9c5 // vmovd xmm6, eax LONG $0xc449c1c4; WORD $0x01f1 // vpinsrw xmm6, xmm6, r9d, 1 LONG $0xc4c5f9c5; BYTE $0x02 // vpextrw eax, xmm4, 2 LONG $0xddc5f9c5; BYTE $0x02 // vpextrw ebx, xmm5, 2 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x02 // vpinsrw xmm6, xmm6, eax, 2 LONG $0xc4c5f9c5; BYTE $0x03 // vpextrw eax, xmm4, 3 LONG $0xddc5f9c5; BYTE $0x03 // vpextrw ebx, xmm5, 3 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x03 // vpinsrw xmm6, xmm6, eax, 3 LONG $0xc4c5f9c5; BYTE $0x04 // vpextrw eax, xmm4, 4 LONG $0xddc5f9c5; BYTE $0x04 // vpextrw ebx, xmm5, 4 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x04 // vpinsrw xmm6, xmm6, eax, 4 LONG $0xc4c5f9c5; BYTE $0x05 // vpextrw eax, xmm4, 5 LONG $0xddc5f9c5; BYTE $0x05 // vpextrw ebx, xmm5, 5 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x05 // vpinsrw xmm6, xmm6, eax, 5 LONG $0xc4c5f9c5; BYTE $0x06 // vpextrw eax, xmm4, 6 LONG $0xddc5f9c5; BYTE $0x06 // vpextrw ebx, xmm5, 6 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x06 // vpinsrw xmm6, xmm6, eax, 6 LONG $0xc4c5f9c5; BYTE $0x07 // vpextrw eax, xmm4, 7 LONG $0xddc5f9c5; BYTE $0x07 // vpextrw ebx, xmm5, 7 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xe0c4c9c5; BYTE $0x07 // vpinsrw xmm4, xmm6, eax, 7 LONG $0x397de3c4; WORD $0x01cd // vextracti128 xmm5, ymm1, 1 LONG $0xc5c5f9c5; BYTE $0x01 // vpextrw eax, xmm5, 1 LONG $0x397de3c4; WORD $0x01d6 // vextracti128 xmm6, ymm2, 1 LONG $0xdec5f9c5; BYTE $0x01 // vpextrw ebx, xmm6, 1 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx WORD 
$0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xe87ef9c5 // vmovd eax, xmm5 LONG $0xf37ef9c5 // vmovd ebx, xmm6 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf86ef9c5 // vmovd xmm7, eax LONG $0xc441c1c4; WORD $0x01f9 // vpinsrw xmm7, xmm7, r9d, 1 LONG $0xc5c5f9c5; BYTE $0x02 // vpextrw eax, xmm5, 2 LONG $0xdec5f9c5; BYTE $0x02 // vpextrw ebx, xmm6, 2 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x02 // vpinsrw xmm7, xmm7, eax, 2 LONG $0xc5c5f9c5; BYTE $0x03 // vpextrw eax, xmm5, 3 LONG $0xdec5f9c5; BYTE $0x03 // vpextrw ebx, xmm6, 3 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x03 // vpinsrw xmm7, xmm7, eax, 3 LONG $0xc5c5f9c5; BYTE $0x04 // vpextrw eax, xmm5, 4 LONG $0xdec5f9c5; BYTE $0x04 // vpextrw ebx, xmm6, 4 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x04 // vpinsrw xmm7, xmm7, eax, 4 LONG $0xc5c5f9c5; BYTE $0x05 // vpextrw eax, xmm5, 5 LONG $0xdec5f9c5; BYTE $0x05 // vpextrw ebx, xmm6, 5 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x05 // vpinsrw xmm7, xmm7, eax, 5 LONG $0xc5c5f9c5; BYTE $0x06 // vpextrw eax, xmm5, 6 LONG $0xdec5f9c5; BYTE $0x06 // vpextrw ebx, xmm6, 6 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf8c4c1c5; BYTE $0x06 // vpinsrw xmm7, xmm7, eax, 6 LONG $0xc5c5f9c5; BYTE $0x07 // vpextrw eax, xmm5, 7 LONG $0xdec5f9c5; BYTE $0x07 // vpextrw ebx, xmm6, 7 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xe8c4c1c5; BYTE $0x07 // vpinsrw xmm5, xmm7, eax, 7 LONG $0xc1c5f9c5; BYTE $0x01 // vpextrw eax, xmm1, 1 LONG $0xdac5f9c5; BYTE $0x01 // vpextrw ebx, xmm2, 1 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xc87ef9c5 // vmovd eax, xmm1 LONG $0xd37ef9c5 // vmovd ebx, xmm2 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf06ef9c5 // vmovd xmm6, eax LONG $0xc449c1c4; WORD $0x01f1 // vpinsrw 
xmm6, xmm6, r9d, 1 LONG $0xc1c5f9c5; BYTE $0x02 // vpextrw eax, xmm1, 2 LONG $0xdac5f9c5; BYTE $0x02 // vpextrw ebx, xmm2, 2 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x02 // vpinsrw xmm6, xmm6, eax, 2 LONG $0xc1c5f9c5; BYTE $0x03 // vpextrw eax, xmm1, 3 LONG $0xdac5f9c5; BYTE $0x03 // vpextrw ebx, xmm2, 3 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x03 // vpinsrw xmm6, xmm6, eax, 3 LONG $0xc1c5f9c5; BYTE $0x04 // vpextrw eax, xmm1, 4 LONG $0xdac5f9c5; BYTE $0x04 // vpextrw ebx, xmm2, 4 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x04 // vpinsrw xmm6, xmm6, eax, 4 LONG $0xc1c5f9c5; BYTE $0x05 // vpextrw eax, xmm1, 5 LONG $0xdac5f9c5; BYTE $0x05 // vpextrw ebx, xmm2, 5 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x05 // vpinsrw xmm6, xmm6, eax, 5 LONG $0xc1c5f9c5; BYTE $0x06 // vpextrw eax, xmm1, 6 LONG $0xdac5f9c5; BYTE $0x06 // vpextrw ebx, xmm2, 6 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xf0c4c9c5; BYTE $0x06 // vpinsrw xmm6, xmm6, eax, 6 LONG $0xc1c5f9c5; BYTE $0x07 // vpextrw eax, xmm1, 7 LONG $0xdac5f9c5; BYTE $0x07 // vpextrw ebx, xmm2, 7 LONG $0x385de3c4; WORD $0x01cb // vinserti128 ymm1, ymm4, xmm3, 1 WORD $0x9966 // cwd WORD $0xf766; BYTE $0xfb // idiv bx LONG $0xd0c4c9c5; BYTE $0x07 // vpinsrw xmm2, xmm6, eax, 7 LONG $0x386de3c4; WORD $0x01d5 // vinserti128 ymm2, ymm2, xmm5, 1 LONG $0xd0dbedc5 // vpand ymm2, ymm2, ymm0 LONG $0xc8dbf5c5 // vpand ymm1, ymm1, ymm0 LONG $0xca67f5c5 // vpackuswb ymm1, ymm1, ymm2 LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 LONG $0x7f7e81c4; WORD $0x380c // vmovdqu ymmword ptr [r8 + r15], ymm1 LONG $0x20c78349 // add r15, 32 WORD $0x394d; BYTE $0xfb // cmp r11, r15 JNE LBB34_6 WORD $0x854d; BYTE $0xf6 // test r14, r14 JE LBB34_12 LBB34_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d LONG $0x015b8d49 // lea rbx, [r11 + 1] WORD $0xc1f6; BYTE $0x01 // test 
cl, 1 JE LBB34_10 LONG $0x04be0f42; BYTE $0x1f // movsx eax, byte ptr [rdi + r11] LONG $0x0cbe0f42; BYTE $0x1e // movsx ecx, byte ptr [rsi + r11] WORD $0x9966 // cwd WORD $0xf766; BYTE $0xf9 // idiv cx LONG $0x18048843 // mov byte ptr [r8 + r11], al WORD $0x8949; BYTE $0xdb // mov r11, rbx LBB34_10: WORD $0x3949; BYTE $0xda // cmp r10, rbx JE LBB34_12 LBB34_11: LONG $0x04be0f42; BYTE $0x1f // movsx eax, byte ptr [rdi + r11] LONG $0x0cbe0f42; BYTE $0x1e // movsx ecx, byte ptr [rsi + r11] WORD $0x9966 // cwd WORD $0xf766; BYTE $0xf9 // idiv cx LONG $0x18048843 // mov byte ptr [r8 + r11], al LONG $0x44be0f42; WORD $0x011f // movsx eax, byte ptr [rdi + r11 + 1] LONG $0x4cbe0f42; WORD $0x011e // movsx ecx, byte ptr [rsi + r11 + 1] WORD $0x9966 // cwd WORD $0xf766; BYTE $0xf9 // idiv cx LONG $0x18448843; BYTE $0x01 // mov byte ptr [r8 + r11 + 1], al LONG $0x02c38349 // add r11, 2 WORD $0x394d; BYTE $0xda // cmp r10, r11 JNE LBB34_11 LBB34_12: LONG $0xe8658d48 // lea rsp, [rbp - 24] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 WORD $0x5f41 // pop r15 BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret WORD $0x00ff TEXT ·_int16_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB35_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x08f88349 // cmp r8, 8 JAE LBB35_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d WORD $0xc031 // xor eax, eax JMP LBB35_13 LBB35_1: WORD $0xc031 // xor eax, eax JMP LBB35_14 LBB35_4: LONG $0x40f88341 // cmp r8d, 64 JAE LBB35_6 WORD $0xc031 // xor eax, eax WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB35_10 LBB35_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor 
xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 LBB35_7: LONG $0x04fdfdc5; BYTE $0x47 // vpaddw ymm0, ymm0, ymmword ptr [rdi + 2*rax] LONG $0x4cfdf5c5; WORD $0x2047 // vpaddw ymm1, ymm1, ymmword ptr [rdi + 2*rax + 32] LONG $0x54fdedc5; WORD $0x4047 // vpaddw ymm2, ymm2, ymmword ptr [rdi + 2*rax + 64] LONG $0x5cfde5c5; WORD $0x6047 // vpaddw ymm3, ymm3, ymmword ptr [rdi + 2*rax + 96] LONG $0x40c08348 // add rax, 64 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB35_7 LONG $0xc0fdf5c5 // vpaddw ymm0, ymm1, ymm0 LONG $0xc0fdedc5 // vpaddw ymm0, ymm2, ymm0 LONG $0xc0fde5c5 // vpaddw ymm0, ymm3, ymm0 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xd072f1c5; BYTE $0x10 // vpsrld xmm1, xmm0, 16 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB35_14 LONG $0x08f98341 // cmp r9d, 8 JB LBB35_13 LBB35_10: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x07 // and edx, 7 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx WORD $0xb70f; BYTE $0xc0 // movzx eax, ax LONG $0xc06ef9c5 // vmovd xmm0, eax LBB35_11: LONG $0x04fdf9c5; BYTE $0x4f // vpaddw xmm0, xmm0, xmmword ptr [rdi + 2*rcx] LONG $0x08c18348 // add rcx, 8 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB35_11 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xd072f1c5; BYTE $0x10 // vpsrld xmm1, xmm0, 16 LONG $0xc1fdf9c5 // vpaddw xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB35_14 
LBB35_13: LONG $0x04034266; BYTE $0x57 // add ax, word ptr [rdi + 2*r10] WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB35_13 LBB35_14: WORD $0x8966; BYTE $0x06 // mov word ptr [rsi], ax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LCPI36_0: // TEXT ·_int16_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xb70f; BYTE $0x07 // movzx eax, word ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB36_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x08f88349 // cmp r8, 8 JAE LBB36_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB36_14 LBB36_1: WORD $0xc189 // mov ecx, eax LBB36_15: WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB36_4: LONG $0x40f88341 // cmp r8d, 64 JAE LBB36_6 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB36_11 LBB36_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB36_7: LONG $0x04eafdc5; BYTE $0x47 // vpminsw ymm0, ymm0, ymmword ptr [rdi + 2*rax] LONG $0x4ceaf5c5; WORD $0x2047 // vpminsw ymm1, ymm1, ymmword ptr [rdi + 2*rax + 32] LONG $0x54eaedc5; WORD $0x4047 // vpminsw ymm2, ymm2, ymmword ptr [rdi + 2*rax + 64] LONG $0x5ceae5c5; WORD $0x6047 // vpminsw ymm3, ymm3, ymmword ptr [rdi + 2*rax + 96] LONG $0x40c08348 // add rax, 64 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB36_7 LONG $0xc1eafdc5 // vpminsw ymm0, ymm0, ymm1 LONG $0xc2eafdc5 // 
vpminsw ymm0, ymm0, ymm2 LONG $0xc3eafdc5 // vpminsw ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1eaf9c5 // vpminsw xmm0, xmm0, xmm1 QUAD $0x0000008405eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI36_0] LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 LONG $0x00800035; BYTE $0x00 // xor eax, 32768 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB36_9 LONG $0x08f98341 // cmp r9d, 8 JB LBB36_14 LBB36_11: WORD $0x894c; BYTE $0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x07 // and edx, 7 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7979e2c4; BYTE $0xc0 // vpbroadcastw xmm0, xmm0 LBB36_12: LONG $0x04eaf9c5; BYTE $0x4f // vpminsw xmm0, xmm0, xmmword ptr [rdi + 2*rcx] LONG $0x08c18348 // add rcx, 8 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB36_12 QUAD $0x0000004005eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI36_0] LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 LONG $0x00800035; BYTE $0x00 // xor eax, 32768 WORD $0xc189 // mov ecx, eax WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB36_15 LBB36_14: LONG $0x0cb70f42; BYTE $0x57 // movzx ecx, word ptr [rdi + 2*r10] WORD $0x3966; BYTE $0xc1 // cmp cx, ax WORD $0x4d0f; BYTE $0xc8 // cmovge ecx, eax WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0xc889 // mov eax, ecx WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB36_14 JMP LBB36_15 LBB36_9: WORD $0xc189 // mov ecx, eax WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LCPI37_0: // WORD $0x8000 WORD $0x8000 WORD $0x8000 WORD $0x8000 WORD $0x8000 WORD $0x8000 WORD $0x8000 WORD $0x8000 TEXT ·_int16_max(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG 
$0xf8e48348 // and rsp, -8 WORD $0xb70f; BYTE $0x07 // movzx eax, word ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB37_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x08f88349 // cmp r8, 8 JAE LBB37_4 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB37_14 LBB37_1: WORD $0xc189 // mov ecx, eax LBB37_15: WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB37_4: LONG $0x40f88341 // cmp r8d, 64 JAE LBB37_6 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB37_11 LBB37_6: WORD $0x8941; BYTE $0xd1 // mov r9d, edx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB37_7: LONG $0x04eefdc5; BYTE $0x47 // vpmaxsw ymm0, ymm0, ymmword ptr [rdi + 2*rax] LONG $0x4ceef5c5; WORD $0x2047 // vpmaxsw ymm1, ymm1, ymmword ptr [rdi + 2*rax + 32] LONG $0x54eeedc5; WORD $0x4047 // vpmaxsw ymm2, ymm2, ymmword ptr [rdi + 2*rax + 64] LONG $0x5ceee5c5; WORD $0x6047 // vpmaxsw ymm3, ymm3, ymmword ptr [rdi + 2*rax + 96] LONG $0x40c08348 // add rax, 64 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB37_7 LONG $0xc1eefdc5 // vpmaxsw ymm0, ymm0, ymm1 LONG $0xc2eefdc5 // vpmaxsw ymm0, ymm0, ymm2 LONG $0xc3eefdc5 // vpmaxsw ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1eef9c5 // vpmaxsw xmm0, xmm0, xmm1 QUAD $0x0000008405eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI37_0] LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 LONG $0x007fff35; BYTE $0x00 // xor eax, 32767 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB37_9 LONG $0x08f98341 // cmp r9d, 8 JB LBB37_14 LBB37_11: WORD $0x894c; BYTE 
$0xd1 // mov rcx, r10 WORD $0xe283; BYTE $0x07 // and edx, 7 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x2949; BYTE $0xd2 // sub r10, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x7979e2c4; BYTE $0xc0 // vpbroadcastw xmm0, xmm0 LBB37_12: LONG $0x04eef9c5; BYTE $0x4f // vpmaxsw xmm0, xmm0, xmmword ptr [rdi + 2*rcx] LONG $0x08c18348 // add rcx, 8 WORD $0x3949; BYTE $0xca // cmp r10, rcx JNE LBB37_12 QUAD $0x0000004005eff9c5 // vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI37_0] LONG $0x4179e2c4; BYTE $0xc0 // vphminposuw xmm0, xmm0 LONG $0xc07ef9c5 // vmovd eax, xmm0 LONG $0x007fff35; BYTE $0x00 // xor eax, 32767 WORD $0xc189 // mov ecx, eax WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB37_15 LBB37_14: LONG $0x0cb70f42; BYTE $0x57 // movzx ecx, word ptr [rdi + 2*r10] WORD $0x3966; BYTE $0xc1 // cmp cx, ax WORD $0x4e0f; BYTE $0xc8 // cmovle ecx, eax WORD $0xff49; BYTE $0xc2 // inc r10 WORD $0xc889 // mov eax, ecx WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB37_14 JMP LBB37_15 LBB37_9: WORD $0xc189 // mov ecx, eax WORD $0x8966; BYTE $0x0e // mov word ptr [rsi], cx WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret WORD $0x7fff WORD $0x7fff WORD $0x7fff WORD $0x7fff WORD $0x7fff WORD $0x7fff WORD $0x7fff WORD $0x7fff TEXT ·_int16_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB38_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x08f88349 // cmp r8, 8 JAE LBB38_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB38_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB38_16 LBB38_15: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG $0x04034266; BYTE 
$0x5f // add ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB38_15 LBB38_16: LONG $0x03f98349 // cmp r9, 3 JB LBB38_18 LBB38_17: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG $0x04034266; BYTE $0x5f // add ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax LONG $0x44b70f42; WORD $0x025e // movzx eax, word ptr [rsi + 2*r11 + 2] LONG $0x44034266; WORD $0x025f // add ax, word ptr [rdi + 2*r11 + 2] LONG $0x44894266; WORD $0x025a // mov word ptr [rdx + 2*r11 + 2], ax LONG $0x44b70f42; WORD $0x045e // movzx eax, word ptr [rsi + 2*r11 + 4] LONG $0x44034266; WORD $0x045f // add ax, word ptr [rdi + 2*r11 + 4] LONG $0x44894266; WORD $0x045a // mov word ptr [rdx + 2*r11 + 4], ax LONG $0x44b70f42; WORD $0x065e // movzx eax, word ptr [rsi + 2*r11 + 6] LONG $0x44034266; WORD $0x065f // add ax, word ptr [rdi + 2*r11 + 6] LONG $0x44894266; WORD $0x065a // mov word ptr [rdx + 2*r11 + 6], ax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB38_17 LBB38_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB38_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB38_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB38_14 LONG $0x40f88341 // cmp r8d, 64 JAE LBB38_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB38_11 LBB38_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB38_8: LONG $0x6f7ea1c4; WORD $0x5604 // vmovdqu ymm0, ymmword 
ptr [rsi + 2*r10] LONG $0x6f7ea1c4; WORD $0x564c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 2*r10 + 32] LONG $0x6f7ea1c4; WORD $0x5654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 2*r10 + 64] LONG $0x6f7ea1c4; WORD $0x565c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 2*r10 + 96] LONG $0xfd7da1c4; WORD $0x5704 // vpaddw ymm0, ymm0, ymmword ptr [rdi + 2*r10] LONG $0xfd75a1c4; WORD $0x574c; BYTE $0x20 // vpaddw ymm1, ymm1, ymmword ptr [rdi + 2*r10 + 32] LONG $0xfd6da1c4; WORD $0x5754; BYTE $0x40 // vpaddw ymm2, ymm2, ymmword ptr [rdi + 2*r10 + 64] LONG $0xfd65a1c4; WORD $0x575c; BYTE $0x60 // vpaddw ymm3, ymm3, ymmword ptr [rdi + 2*r10 + 96] LONG $0x7f7ea1c4; WORD $0x5204 // vmovdqu ymmword ptr [rdx + 2*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x524c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 2*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x5254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 2*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x525c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 2*r10 + 96], ymm3 LONG $0x40c28349 // add r10, 64 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB38_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB38_18 LONG $0x08f98341 // cmp r9d, 8 JB LBB38_14 LBB38_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x07e28341 // and r10d, 7 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB38_12: LONG $0x6f7aa1c4; WORD $0x4e04 // vmovdqu xmm0, xmmword ptr [rsi + 2*r9] LONG $0xfd79a1c4; WORD $0x4f04 // vpaddw xmm0, xmm0, xmmword ptr [rdi + 2*r9] LONG $0x7f7aa1c4; WORD $0x4a04 // vmovdqu xmmword ptr [rdx + 2*r9], xmm0 LONG $0x08c18349 // add r9, 8 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB38_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB38_14 JMP LBB38_18 TEXT ·_int16_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 
// test ecx, ecx JLE LBB39_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x08f88349 // cmp r8, 8 JAE LBB39_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB39_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB39_16 LBB39_15: LONG $0x04b70f42; BYTE $0x5f // movzx eax, word ptr [rdi + 2*r11] LONG $0x042b4266; BYTE $0x5e // sub ax, word ptr [rsi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB39_15 LBB39_16: LONG $0x03f98349 // cmp r9, 3 JB LBB39_18 LBB39_17: LONG $0x04b70f42; BYTE $0x5f // movzx eax, word ptr [rdi + 2*r11] LONG $0x042b4266; BYTE $0x5e // sub ax, word ptr [rsi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax LONG $0x44b70f42; WORD $0x025f // movzx eax, word ptr [rdi + 2*r11 + 2] LONG $0x442b4266; WORD $0x025e // sub ax, word ptr [rsi + 2*r11 + 2] LONG $0x44894266; WORD $0x025a // mov word ptr [rdx + 2*r11 + 2], ax LONG $0x44b70f42; WORD $0x045f // movzx eax, word ptr [rdi + 2*r11 + 4] LONG $0x442b4266; WORD $0x045e // sub ax, word ptr [rsi + 2*r11 + 4] LONG $0x44894266; WORD $0x045a // mov word ptr [rdx + 2*r11 + 4], ax LONG $0x44b70f42; WORD $0x065f // movzx eax, word ptr [rdi + 2*r11 + 6] LONG $0x442b4266; WORD $0x065e // sub ax, word ptr [rsi + 2*r11 + 6] LONG $0x44894266; WORD $0x065a // mov word ptr [rdx + 2*r11 + 6], ax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB39_17 LBB39_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB39_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB39_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD 
$0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB39_14 LONG $0x40f88341 // cmp r8d, 64 JAE LBB39_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB39_11 LBB39_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB39_8: LONG $0x6f7ea1c4; WORD $0x5704 // vmovdqu ymm0, ymmword ptr [rdi + 2*r10] LONG $0x6f7ea1c4; WORD $0x574c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + 2*r10 + 32] LONG $0x6f7ea1c4; WORD $0x5754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + 2*r10 + 64] LONG $0x6f7ea1c4; WORD $0x575c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + 2*r10 + 96] LONG $0xf97da1c4; WORD $0x5604 // vpsubw ymm0, ymm0, ymmword ptr [rsi + 2*r10] LONG $0xf975a1c4; WORD $0x564c; BYTE $0x20 // vpsubw ymm1, ymm1, ymmword ptr [rsi + 2*r10 + 32] LONG $0xf96da1c4; WORD $0x5654; BYTE $0x40 // vpsubw ymm2, ymm2, ymmword ptr [rsi + 2*r10 + 64] LONG $0xf965a1c4; WORD $0x565c; BYTE $0x60 // vpsubw ymm3, ymm3, ymmword ptr [rsi + 2*r10 + 96] LONG $0x7f7ea1c4; WORD $0x5204 // vmovdqu ymmword ptr [rdx + 2*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x524c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 2*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x5254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 2*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x525c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 2*r10 + 96], ymm3 LONG $0x40c28349 // add r10, 64 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB39_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB39_18 LONG $0x08f98341 // cmp r9d, 8 JB LBB39_14 LBB39_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x07e28341 // and r10d, 7 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB39_12: LONG $0x6f7aa1c4; WORD $0x4f04 // vmovdqu xmm0, xmmword ptr [rdi + 2*r9] LONG $0xf979a1c4; WORD $0x4e04 // vpsubw xmm0, xmm0, xmmword 
ptr [rsi + 2*r9] LONG $0x7f7aa1c4; WORD $0x4a04 // vmovdqu xmmword ptr [rdx + 2*r9], xmm0 LONG $0x08c18349 // add r9, 8 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB39_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB39_14 JMP LBB39_18 TEXT ·_int16_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB40_18 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x08f88349 // cmp r8, 8 JAE LBB40_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d LBB40_14: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB40_16 LBB40_15: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG $0xaf0f4266; WORD $0x5f04 // imul ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB40_15 LBB40_16: LONG $0x03f98349 // cmp r9, 3 JB LBB40_18 LBB40_17: LONG $0x04b70f42; BYTE $0x5e // movzx eax, word ptr [rsi + 2*r11] LONG $0xaf0f4266; WORD $0x5f04 // imul ax, word ptr [rdi + 2*r11] LONG $0x04894266; BYTE $0x5a // mov word ptr [rdx + 2*r11], ax LONG $0x44b70f42; WORD $0x025e // movzx eax, word ptr [rsi + 2*r11 + 2] LONG $0xaf0f4266; WORD $0x5f44; BYTE $0x02 // imul ax, word ptr [rdi + 2*r11 + 2] LONG $0x44894266; WORD $0x025a // mov word ptr [rdx + 2*r11 + 2], ax LONG $0x44b70f42; WORD $0x045e // movzx eax, word ptr [rsi + 2*r11 + 4] LONG $0xaf0f4266; WORD $0x5f44; BYTE $0x04 // imul ax, word ptr [rdi + 2*r11 + 4] LONG $0x44894266; WORD $0x045a // mov word ptr [rdx + 2*r11 + 4], ax LONG $0x44b70f42; WORD $0x065e // movzx eax, word ptr [rsi + 2*r11 + 6] LONG $0xaf0f4266; WORD $0x5f44; BYTE $0x06 // imul ax, word ptr [rdi + 2*r11 + 6] LONG $0x44894266; 
WORD $0x065a // mov word ptr [rdx + 2*r11 + 6], ax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB40_17 LBB40_18: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret LBB40_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB40_14 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB40_14 LONG $0x40f88341 // cmp r8d, 64 JAE LBB40_7 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB40_11 LBB40_7: WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x3fe18341 // and r9d, 63 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB40_8: LONG $0x6f7ea1c4; WORD $0x5604 // vmovdqu ymm0, ymmword ptr [rsi + 2*r10] LONG $0x6f7ea1c4; WORD $0x564c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 2*r10 + 32] LONG $0x6f7ea1c4; WORD $0x5654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 2*r10 + 64] LONG $0x6f7ea1c4; WORD $0x565c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 2*r10 + 96] LONG $0xd57da1c4; WORD $0x5704 // vpmullw ymm0, ymm0, ymmword ptr [rdi + 2*r10] LONG $0xd575a1c4; WORD $0x574c; BYTE $0x20 // vpmullw ymm1, ymm1, ymmword ptr [rdi + 2*r10 + 32] LONG $0xd56da1c4; WORD $0x5754; BYTE $0x40 // vpmullw ymm2, ymm2, ymmword ptr [rdi + 2*r10 + 64] LONG $0xd565a1c4; WORD $0x575c; BYTE $0x60 // vpmullw ymm3, ymm3, ymmword ptr [rdi + 2*r10 + 96] LONG $0x7f7ea1c4; WORD $0x5204 // vmovdqu ymmword ptr [rdx + 2*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x524c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 2*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x5254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 2*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x525c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 2*r10 + 96], ymm3 LONG 
$0x40c28349 // add r10, 64 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB40_8 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB40_18 LONG $0x08f98341 // cmp r9d, 8 JB LBB40_14 LBB40_11: WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x07e28341 // and r10d, 7 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xd3 // sub r11, r10 LBB40_12: LONG $0x6f7aa1c4; WORD $0x4e04 // vmovdqu xmm0, xmmword ptr [rsi + 2*r9] LONG $0xd579a1c4; WORD $0x4f04 // vpmullw xmm0, xmm0, xmmword ptr [rdi + 2*r9] LONG $0x7f7aa1c4; WORD $0x4a04 // vmovdqu xmmword ptr [rdx + 2*r9], xmm0 LONG $0x08c18349 // add r9, 8 WORD $0x394d; BYTE $0xcb // cmp r11, r9 JNE LBB40_12 WORD $0x854d; BYTE $0xd2 // test r10, r10 JNE LBB40_14 JMP LBB40_18 TEXT ·_int16_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5741 // push r15 WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB41_12 WORD $0x8949; BYTE $0xd0 // mov r8, rdx WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x10fa8349 // cmp r10, 16 JAE LBB41_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB41_8 LBB41_3: WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x20f88348 // cmp rax, 32 JB LBB41_8 WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB41_8 WORD $0x8941; BYTE $0xce // mov r14d, ecx LONG $0x0fe68341 // and r14d, 15 WORD $0x894d; BYTE $0xd3 // mov r11, r10 WORD $0x294d; BYTE $0xf3 // sub r11, r14 WORD $0x3145; BYTE $0xff // xor r15d, r15d LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 LBB41_6: LONG $0x237da2c4; WORD $0x7f4c; BYTE $0x10 // vpmovsxwd ymm1, xmmword ptr [rdi + 2*r15 + 16] LONG $0x237da2c4; WORD $0x7f24 // vpmovsxwd ymm4, xmmword ptr [rdi + 2*r15] 
LONG $0x237da2c4; WORD $0x7e54; BYTE $0x10 // vpmovsxwd ymm2, xmmword ptr [rsi + 2*r15 + 16] LONG $0x237da2c4; WORD $0x7e2c // vpmovsxwd ymm5, xmmword ptr [rsi + 2*r15] LONG $0x397de3c4; WORD $0x01e3 // vextracti128 xmm3, ymm4, 1 LONG $0x1679e3c4; WORD $0x01d8 // vpextrd eax, xmm3, 1 LONG $0x397de3c4; WORD $0x01ee // vextracti128 xmm6, ymm5, 1 LONG $0x1679e3c4; WORD $0x01f3 // vpextrd ebx, xmm6, 1 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xd87ef9c5 // vmovd eax, xmm3 LONG $0xf37ef9c5 // vmovd ebx, xmm6 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0xf86ef9c5 // vmovd xmm7, eax LONG $0x2241c3c4; WORD $0x01f9 // vpinsrd xmm7, xmm7, r9d, 1 LONG $0x1679e3c4; WORD $0x02d8 // vpextrd eax, xmm3, 2 LONG $0x1679e3c4; WORD $0x02f3 // vpextrd ebx, xmm6, 2 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2241e3c4; WORD $0x02f8 // vpinsrd xmm7, xmm7, eax, 2 LONG $0x1679e3c4; WORD $0x03d8 // vpextrd eax, xmm3, 3 LONG $0x1679e3c4; WORD $0x03f3 // vpextrd ebx, xmm6, 3 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2241e3c4; WORD $0x03d8 // vpinsrd xmm3, xmm7, eax, 3 LONG $0x1679e3c4; WORD $0x01e0 // vpextrd eax, xmm4, 1 LONG $0x1679e3c4; WORD $0x01eb // vpextrd ebx, xmm5, 1 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xe07ef9c5 // vmovd eax, xmm4 LONG $0xeb7ef9c5 // vmovd ebx, xmm5 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0xf06ef9c5 // vmovd xmm6, eax LONG $0x2249c3c4; WORD $0x01f1 // vpinsrd xmm6, xmm6, r9d, 1 LONG $0x1679e3c4; WORD $0x02e0 // vpextrd eax, xmm4, 2 LONG $0x1679e3c4; WORD $0x02eb // vpextrd ebx, xmm5, 2 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2249e3c4; WORD $0x02f0 // vpinsrd xmm6, xmm6, eax, 2 LONG $0x1679e3c4; WORD $0x03e0 // vpextrd eax, xmm4, 3 LONG $0x1679e3c4; WORD $0x03eb // vpextrd ebx, xmm5, 3 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2249e3c4; WORD $0x03e0 // vpinsrd xmm4, xmm6, eax, 3 LONG $0x397de3c4; WORD $0x01cd // 
vextracti128 xmm5, ymm1, 1 LONG $0x1679e3c4; WORD $0x01e8 // vpextrd eax, xmm5, 1 LONG $0x397de3c4; WORD $0x01d6 // vextracti128 xmm6, ymm2, 1 LONG $0x1679e3c4; WORD $0x01f3 // vpextrd ebx, xmm6, 1 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xe87ef9c5 // vmovd eax, xmm5 LONG $0xf37ef9c5 // vmovd ebx, xmm6 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0xf86ef9c5 // vmovd xmm7, eax LONG $0x2241c3c4; WORD $0x01f9 // vpinsrd xmm7, xmm7, r9d, 1 LONG $0x1679e3c4; WORD $0x02e8 // vpextrd eax, xmm5, 2 LONG $0x1679e3c4; WORD $0x02f3 // vpextrd ebx, xmm6, 2 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2241e3c4; WORD $0x02f8 // vpinsrd xmm7, xmm7, eax, 2 LONG $0x1679e3c4; WORD $0x03e8 // vpextrd eax, xmm5, 3 LONG $0x1679e3c4; WORD $0x03f3 // vpextrd ebx, xmm6, 3 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2241e3c4; WORD $0x03e8 // vpinsrd xmm5, xmm7, eax, 3 LONG $0x1679e3c4; WORD $0x01c8 // vpextrd eax, xmm1, 1 LONG $0x385de3c4; WORD $0x01db // vinserti128 ymm3, ymm4, xmm3, 1 LONG $0x1679e3c4; WORD $0x01d3 // vpextrd ebx, xmm2, 1 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0xc87ef9c5 // vmovd eax, xmm1 LONG $0xd37ef9c5 // vmovd ebx, xmm2 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0xe06ef9c5 // vmovd xmm4, eax LONG $0x2259c3c4; WORD $0x01e1 // vpinsrd xmm4, xmm4, r9d, 1 LONG $0x1679e3c4; WORD $0x02c8 // vpextrd eax, xmm1, 2 LONG $0x1679e3c4; WORD $0x02d3 // vpextrd ebx, xmm2, 2 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2259e3c4; WORD $0x02e0 // vpinsrd xmm4, xmm4, eax, 2 LONG $0x1679e3c4; WORD $0x03c8 // vpextrd eax, xmm1, 3 LONG $0x1679e3c4; WORD $0x03d3 // vpextrd ebx, xmm2, 3 BYTE $0x99 // cdq WORD $0xfbf7 // idiv ebx LONG $0x2259e3c4; WORD $0x03c8 // vpinsrd xmm1, xmm4, eax, 3 LONG $0x3875e3c4; WORD $0x01cd // vinserti128 ymm1, ymm1, xmm5, 1 LONG $0x0e75e3c4; WORD $0xaac8 // vpblendw ymm1, ymm1, ymm0, 170 LONG $0x0e65e3c4; WORD $0xaad0 // vpblendw 
ymm2, ymm3, ymm0, 170 LONG $0x2b6de2c4; BYTE $0xc9 // vpackusdw ymm1, ymm2, ymm1 LONG $0x00fde3c4; WORD $0xd8c9 // vpermq ymm1, ymm1, 216 LONG $0x7f7e81c4; WORD $0x780c // vmovdqu ymmword ptr [r8 + 2*r15], ymm1 LONG $0x10c78349 // add r15, 16 WORD $0x394d; BYTE $0xfb // cmp r11, r15 JNE LBB41_6 WORD $0x854d; BYTE $0xf6 // test r14, r14 JE LBB41_12 LBB41_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d LONG $0x015b8d49 // lea rbx, [r11 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB41_10 LONG $0x04bf0f42; BYTE $0x5f // movsx eax, word ptr [rdi + 2*r11] LONG $0x0cbf0f42; BYTE $0x5e // movsx ecx, word ptr [rsi + 2*r11] BYTE $0x99 // cdq WORD $0xf9f7 // idiv ecx LONG $0x04894366; BYTE $0x58 // mov word ptr [r8 + 2*r11], ax WORD $0x8949; BYTE $0xdb // mov r11, rbx LBB41_10: WORD $0x3949; BYTE $0xda // cmp r10, rbx JE LBB41_12 LBB41_11: LONG $0x04bf0f42; BYTE $0x5f // movsx eax, word ptr [rdi + 2*r11] LONG $0x0cbf0f42; BYTE $0x5e // movsx ecx, word ptr [rsi + 2*r11] BYTE $0x99 // cdq WORD $0xf9f7 // idiv ecx LONG $0x04894366; BYTE $0x58 // mov word ptr [r8 + 2*r11], ax LONG $0x44bf0f42; WORD $0x025f // movsx eax, word ptr [rdi + 2*r11 + 2] LONG $0x4cbf0f42; WORD $0x025e // movsx ecx, word ptr [rsi + 2*r11 + 2] BYTE $0x99 // cdq WORD $0xf9f7 // idiv ecx LONG $0x44894366; WORD $0x0258 // mov word ptr [r8 + 2*r11 + 2], ax LONG $0x02c38349 // add r11, 2 WORD $0x394d; BYTE $0xda // cmp r10, r11 JNE LBB41_11 LBB41_12: LONG $0xe8658d48 // lea rsp, [rbp - 24] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 WORD $0x5f41 // pop r15 BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int32_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB42_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x20f88349 // cmp r8, 32 JAE LBB42_4 WORD $0xc931 // xor ecx, ecx WORD $0xc031 // xor eax, eax JMP 
LBB42_7 LBB42_1: WORD $0xc031 // xor eax, eax JMP LBB42_8 LBB42_4: WORD $0xe283; BYTE $0x1f // and edx, 31 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 LBB42_5: LONG $0x04fefdc5; BYTE $0x87 // vpaddd ymm0, ymm0, ymmword ptr [rdi + 4*rax] LONG $0x4cfef5c5; WORD $0x2087 // vpaddd ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32] LONG $0x54feedc5; WORD $0x4087 // vpaddd ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64] LONG $0x5cfee5c5; WORD $0x6087 // vpaddd ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96] LONG $0x20c08348 // add rax, 32 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB42_5 LONG $0xc0fef5c5 // vpaddd ymm0, ymm1, ymm0 LONG $0xc0feedc5 // vpaddd ymm0, ymm2, ymm0 LONG $0xc0fee5c5 // vpaddd ymm0, ymm3, ymm0 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1fef9c5 // vpaddd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1fef9c5 // vpaddd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0xc1fef9c5 // vpaddd xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB42_8 LBB42_7: WORD $0x0403; BYTE $0x8f // add eax, dword ptr [rdi + 4*rcx] WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB42_7 LBB42_8: WORD $0x0689 // mov dword ptr [rsi], eax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int32_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0x078b // mov eax, dword ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB43_7 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x20f88349 // 
cmp r8, 32 JAE LBB43_3 WORD $0xc931 // xor ecx, ecx JMP LBB43_6 LBB43_3: WORD $0xe283; BYTE $0x1f // and edx, 31 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB43_4: LONG $0x397de2c4; WORD $0x8704 // vpminsd ymm0, ymm0, ymmword ptr [rdi + 4*rax] LONG $0x3975e2c4; WORD $0x874c; BYTE $0x20 // vpminsd ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32] LONG $0x396de2c4; WORD $0x8754; BYTE $0x40 // vpminsd ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64] LONG $0x3965e2c4; WORD $0x875c; BYTE $0x60 // vpminsd ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96] LONG $0x20c08348 // add rax, 32 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB43_4 LONG $0x397de2c4; BYTE $0xc1 // vpminsd ymm0, ymm0, ymm1 LONG $0x397de2c4; BYTE $0xc2 // vpminsd ymm0, ymm0, ymm2 LONG $0x397de2c4; BYTE $0xc3 // vpminsd ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0x3979e2c4; BYTE $0xc1 // vpminsd xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB43_7 LBB43_6: WORD $0x148b; BYTE $0x8f // mov edx, dword ptr [rdi + 4*rcx] WORD $0xc239 // cmp edx, eax WORD $0x4c0f; BYTE $0xc2 // cmovl eax, edx WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB43_6 LBB43_7: WORD $0x0689 // mov dword ptr [rsi], eax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int32_max(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE 
$0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0x078b // mov eax, dword ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB44_7 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x20f88349 // cmp r8, 32 JAE LBB44_3 WORD $0xc931 // xor ecx, ecx JMP LBB44_6 LBB44_3: WORD $0xe283; BYTE $0x1f // and edx, 31 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LBB44_4: LONG $0x3d7de2c4; WORD $0x8704 // vpmaxsd ymm0, ymm0, ymmword ptr [rdi + 4*rax] LONG $0x3d75e2c4; WORD $0x874c; BYTE $0x20 // vpmaxsd ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32] LONG $0x3d6de2c4; WORD $0x8754; BYTE $0x40 // vpmaxsd ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64] LONG $0x3d65e2c4; WORD $0x875c; BYTE $0x60 // vpmaxsd ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96] LONG $0x20c08348 // add rax, 32 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB44_4 LONG $0x3d7de2c4; BYTE $0xc1 // vpmaxsd ymm0, ymm0, ymm1 LONG $0x3d7de2c4; BYTE $0xc2 // vpmaxsd ymm0, ymm0, ymm2 LONG $0x3d7de2c4; BYTE $0xc3 // vpmaxsd ymm0, ymm0, ymm3 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0x3d79e2c4; BYTE $0xc1 // vpmaxsd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0x3d79e2c4; BYTE $0xc1 // vpmaxsd xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0x55 // vpshufd xmm1, xmm0, 85 LONG $0x3d79e2c4; BYTE $0xc1 // vpmaxsd xmm0, xmm0, xmm1 LONG $0xc07ef9c5 // vmovd eax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB44_7 LBB44_6: WORD $0x148b; BYTE $0x8f // mov edx, dword ptr [rdi + 4*rcx] WORD $0xc239 // cmp edx, eax WORD $0x4f0f; BYTE $0xc2 // cmovg eax, edx WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB44_6 LBB44_7: WORD $0x0689 // mov dword 
ptr [rsi], eax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int32_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB45_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB45_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB45_8 LBB45_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB45_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB45_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB45_6: LONG $0x6f7ea1c4; WORD $0x9604 // vmovdqu ymm0, ymmword ptr [rsi + 4*r10] LONG $0x6f7ea1c4; WORD $0x964c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0x6f7ea1c4; WORD $0x9654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0x6f7ea1c4; WORD $0x965c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0xfe7da1c4; WORD $0x9704 // vpaddd ymm0, ymm0, ymmword ptr [rdi + 4*r10] LONG $0xfe75a1c4; WORD $0x974c; BYTE $0x20 // vpaddd ymm1, ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0xfe6da1c4; WORD $0x9754; BYTE $0x40 // vpaddd ymm2, ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0xfe65a1c4; WORD $0x975c; BYTE $0x60 // vpaddd ymm3, ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0x7f7ea1c4; WORD $0x9204 // vmovdqu ymmword ptr [rdx + 4*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x924c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x9254; BYTE $0x40 // 
vmovdqu ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x925c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB45_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB45_12 LBB45_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB45_10 LBB45_9: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x9f040342 // add eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB45_9 LBB45_10: LONG $0x03f98349 // cmp r9, 3 JB LBB45_12 LBB45_11: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x9f040342 // add eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax LONG $0x9e448b42; BYTE $0x04 // mov eax, dword ptr [rsi + 4*r11 + 4] LONG $0x9f440342; BYTE $0x04 // add eax, dword ptr [rdi + 4*r11 + 4] LONG $0x9a448942; BYTE $0x04 // mov dword ptr [rdx + 4*r11 + 4], eax LONG $0x9e448b42; BYTE $0x08 // mov eax, dword ptr [rsi + 4*r11 + 8] LONG $0x9f440342; BYTE $0x08 // add eax, dword ptr [rdi + 4*r11 + 8] LONG $0x9a448942; BYTE $0x08 // mov dword ptr [rdx + 4*r11 + 8], eax LONG $0x9e448b42; BYTE $0x0c // mov eax, dword ptr [rsi + 4*r11 + 12] LONG $0x9f440342; BYTE $0x0c // add eax, dword ptr [rdi + 4*r11 + 12] LONG $0x9a448942; BYTE $0x0c // mov dword ptr [rdx + 4*r11 + 12], eax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB45_11 LBB45_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int32_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and 
rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB46_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB46_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB46_8 LBB46_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB46_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB46_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB46_6: LONG $0x6f7ea1c4; WORD $0x9704 // vmovdqu ymm0, ymmword ptr [rdi + 4*r10] LONG $0x6f7ea1c4; WORD $0x974c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0x6f7ea1c4; WORD $0x9754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0x6f7ea1c4; WORD $0x975c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0xfa7da1c4; WORD $0x9604 // vpsubd ymm0, ymm0, ymmword ptr [rsi + 4*r10] LONG $0xfa75a1c4; WORD $0x964c; BYTE $0x20 // vpsubd ymm1, ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0xfa6da1c4; WORD $0x9654; BYTE $0x40 // vpsubd ymm2, ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0xfa65a1c4; WORD $0x965c; BYTE $0x60 // vpsubd ymm3, ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0x7f7ea1c4; WORD $0x9204 // vmovdqu ymmword ptr [rdx + 4*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x924c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x9254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x925c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB46_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB46_12 LBB46_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; 
BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB46_10 LBB46_9: LONG $0x9f048b42 // mov eax, dword ptr [rdi + 4*r11] LONG $0x9e042b42 // sub eax, dword ptr [rsi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB46_9 LBB46_10: LONG $0x03f98349 // cmp r9, 3 JB LBB46_12 LBB46_11: LONG $0x9f048b42 // mov eax, dword ptr [rdi + 4*r11] LONG $0x9e042b42 // sub eax, dword ptr [rsi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax LONG $0x9f448b42; BYTE $0x04 // mov eax, dword ptr [rdi + 4*r11 + 4] LONG $0x9e442b42; BYTE $0x04 // sub eax, dword ptr [rsi + 4*r11 + 4] LONG $0x9a448942; BYTE $0x04 // mov dword ptr [rdx + 4*r11 + 4], eax LONG $0x9f448b42; BYTE $0x08 // mov eax, dword ptr [rdi + 4*r11 + 8] LONG $0x9e442b42; BYTE $0x08 // sub eax, dword ptr [rsi + 4*r11 + 8] LONG $0x9a448942; BYTE $0x08 // mov dword ptr [rdx + 4*r11 + 8], eax LONG $0x9f448b42; BYTE $0x0c // mov eax, dword ptr [rdi + 4*r11 + 12] LONG $0x9e442b42; BYTE $0x0c // sub eax, dword ptr [rsi + 4*r11 + 12] LONG $0x9a448942; BYTE $0x0c // mov dword ptr [rdx + 4*r11 + 12], eax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB46_11 LBB46_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int32_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB47_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB47_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB47_8 LBB47_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; 
WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB47_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB47_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB47_6: LONG $0x6f7ea1c4; WORD $0x9604 // vmovdqu ymm0, ymmword ptr [rsi + 4*r10] LONG $0x6f7ea1c4; WORD $0x964c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0x6f7ea1c4; WORD $0x9654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0x6f7ea1c4; WORD $0x965c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0x407da2c4; WORD $0x9704 // vpmulld ymm0, ymm0, ymmword ptr [rdi + 4*r10] LONG $0x4075a2c4; WORD $0x974c; BYTE $0x20 // vpmulld ymm1, ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0x406da2c4; WORD $0x9754; BYTE $0x40 // vpmulld ymm2, ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0x4065a2c4; WORD $0x975c; BYTE $0x60 // vpmulld ymm3, ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0x7f7ea1c4; WORD $0x9204 // vmovdqu ymmword ptr [rdx + 4*r10], ymm0 LONG $0x7f7ea1c4; WORD $0x924c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0x9254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0x925c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB47_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB47_12 LBB47_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB47_10 LBB47_9: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x04af0f42; BYTE $0x9f // imul eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax WORD 
$0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB47_9 LBB47_10: LONG $0x03f98349 // cmp r9, 3 JB LBB47_12 LBB47_11: LONG $0x9e048b42 // mov eax, dword ptr [rsi + 4*r11] LONG $0x04af0f42; BYTE $0x9f // imul eax, dword ptr [rdi + 4*r11] LONG $0x9a048942 // mov dword ptr [rdx + 4*r11], eax LONG $0x9e448b42; BYTE $0x04 // mov eax, dword ptr [rsi + 4*r11 + 4] LONG $0x44af0f42; WORD $0x049f // imul eax, dword ptr [rdi + 4*r11 + 4] LONG $0x9a448942; BYTE $0x04 // mov dword ptr [rdx + 4*r11 + 4], eax LONG $0x9e448b42; BYTE $0x08 // mov eax, dword ptr [rsi + 4*r11 + 8] LONG $0x44af0f42; WORD $0x089f // imul eax, dword ptr [rdi + 4*r11 + 8] LONG $0x9a448942; BYTE $0x08 // mov dword ptr [rdx + 4*r11 + 8], eax LONG $0x9e448b42; BYTE $0x0c // mov eax, dword ptr [rsi + 4*r11 + 12] LONG $0x44af0f42; WORD $0x0c9f // imul eax, dword ptr [rdi + 4*r11 + 12] LONG $0x9a448942; BYTE $0x0c // mov dword ptr [rdx + 4*r11 + 12], eax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB47_11 LBB47_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int32_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5741 // push r15 WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB48_12 WORD $0x8949; BYTE $0xd0 // mov r8, rdx WORD $0x8941; BYTE $0xcb // mov r11d, ecx LONG $0x08fb8349 // cmp r11, 8 JAE LBB48_3 WORD $0x3145; BYTE $0xff // xor r15d, r15d JMP LBB48_8 LBB48_3: WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xff // xor r15d, r15d LONG $0x20f88348 // cmp rax, 32 JB LBB48_8 WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB48_8 WORD $0x8941; BYTE $0xce // mov 
r14d, ecx LONG $0x07e68341 // and r14d, 7 WORD $0x894d; BYTE $0xdf // mov r15, r11 WORD $0x294d; BYTE $0xf7 // sub r15, r14 WORD $0xdb31 // xor ebx, ebx LBB48_6: LONG $0x9f0c8b44 // mov r9d, dword ptr [rdi + 4*rbx] LONG $0x049f448b // mov eax, dword ptr [rdi + 4*rbx + 4] BYTE $0x99 // cdq LONG $0x049e7cf7 // idiv dword ptr [rsi + 4*rbx + 4] WORD $0x8941; BYTE $0xc2 // mov r10d, eax WORD $0x8944; BYTE $0xc8 // mov eax, r9d BYTE $0x99 // cdq WORD $0x3cf7; BYTE $0x9e // idiv dword ptr [rsi + 4*rbx] LONG $0xc06ef9c5 // vmovd xmm0, eax LONG $0x2279c3c4; WORD $0x01c2 // vpinsrd xmm0, xmm0, r10d, 1 LONG $0x089f448b // mov eax, dword ptr [rdi + 4*rbx + 8] BYTE $0x99 // cdq LONG $0x089e7cf7 // idiv dword ptr [rsi + 4*rbx + 8] LONG $0x2279e3c4; WORD $0x02c0 // vpinsrd xmm0, xmm0, eax, 2 LONG $0x0c9f448b // mov eax, dword ptr [rdi + 4*rbx + 12] BYTE $0x99 // cdq LONG $0x0c9e7cf7 // idiv dword ptr [rsi + 4*rbx + 12] LONG $0x2279e3c4; WORD $0x03c0 // vpinsrd xmm0, xmm0, eax, 3 LONG $0x149f448b // mov eax, dword ptr [rdi + 4*rbx + 20] BYTE $0x99 // cdq LONG $0x149e7cf7 // idiv dword ptr [rsi + 4*rbx + 20] WORD $0x8941; BYTE $0xc1 // mov r9d, eax LONG $0x109f448b // mov eax, dword ptr [rdi + 4*rbx + 16] BYTE $0x99 // cdq LONG $0x109e7cf7 // idiv dword ptr [rsi + 4*rbx + 16] LONG $0xc86ef9c5 // vmovd xmm1, eax LONG $0x2271c3c4; WORD $0x01c9 // vpinsrd xmm1, xmm1, r9d, 1 LONG $0x189f448b // mov eax, dword ptr [rdi + 4*rbx + 24] BYTE $0x99 // cdq LONG $0x189e7cf7 // idiv dword ptr [rsi + 4*rbx + 24] LONG $0x2271e3c4; WORD $0x02c8 // vpinsrd xmm1, xmm1, eax, 2 LONG $0x1c9f448b // mov eax, dword ptr [rdi + 4*rbx + 28] BYTE $0x99 // cdq LONG $0x1c9e7cf7 // idiv dword ptr [rsi + 4*rbx + 28] LONG $0x2271e3c4; WORD $0x03c8 // vpinsrd xmm1, xmm1, eax, 3 LONG $0x7f7ac1c4; WORD $0x984c; BYTE $0x10 // vmovdqu xmmword ptr [r8 + 4*rbx + 16], xmm1 LONG $0x7f7ac1c4; WORD $0x9804 // vmovdqu xmmword ptr [r8 + 4*rbx], xmm0 LONG $0x08c38348 // add rbx, 8 WORD $0x3949; BYTE $0xdf // cmp r15, rbx JNE 
LBB48_6 WORD $0x854d; BYTE $0xf6 // test r14, r14 JE LBB48_12 LBB48_8: WORD $0x2944; BYTE $0xf9 // sub ecx, r15d LONG $0x015f8d49 // lea rbx, [r15 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB48_10 LONG $0xbf048b42 // mov eax, dword ptr [rdi + 4*r15] BYTE $0x99 // cdq LONG $0xbe3cf742 // idiv dword ptr [rsi + 4*r15] LONG $0xb8048943 // mov dword ptr [r8 + 4*r15], eax WORD $0x8949; BYTE $0xdf // mov r15, rbx LBB48_10: WORD $0x3949; BYTE $0xdb // cmp r11, rbx JE LBB48_12 LBB48_11: LONG $0xbf048b42 // mov eax, dword ptr [rdi + 4*r15] BYTE $0x99 // cdq LONG $0xbe3cf742 // idiv dword ptr [rsi + 4*r15] LONG $0xb8048943 // mov dword ptr [r8 + 4*r15], eax LONG $0xbf448b42; BYTE $0x04 // mov eax, dword ptr [rdi + 4*r15 + 4] BYTE $0x99 // cdq LONG $0xbe7cf742; BYTE $0x04 // idiv dword ptr [rsi + 4*r15 + 4] LONG $0xb8448943; BYTE $0x04 // mov dword ptr [r8 + 4*r15 + 4], eax LONG $0x02c78349 // add r15, 2 WORD $0x394d; BYTE $0xfb // cmp r11, r15 JNE LBB48_11 LBB48_12: LONG $0xe8658d48 // lea rsp, [rbp - 24] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 WORD $0x5f41 // pop r15 BYTE $0x5d // pop rbp BYTE $0xc3 // ret TEXT ·_int64_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB49_1 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB49_4 WORD $0xc931 // xor ecx, ecx WORD $0xc031 // xor eax, eax JMP LBB49_7 LBB49_1: WORD $0xc031 // xor eax, eax JMP LBB49_8 LBB49_4: WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc0eff9c5 // vpxor xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc9eff1c5 // vpxor xmm1, xmm1, xmm1 LONG $0xd2efe9c5 // vpxor xmm2, xmm2, xmm2 LONG $0xdbefe1c5 // vpxor xmm3, xmm3, xmm3 LBB49_5: LONG $0x04d4fdc5; BYTE $0xc7 // vpaddq ymm0, ymm0, ymmword ptr [rdi + 8*rax] LONG $0x4cd4f5c5; WORD 
$0x20c7 // vpaddq ymm1, ymm1, ymmword ptr [rdi + 8*rax + 32] LONG $0x54d4edc5; WORD $0x40c7 // vpaddq ymm2, ymm2, ymmword ptr [rdi + 8*rax + 64] LONG $0x5cd4e5c5; WORD $0x60c7 // vpaddq ymm3, ymm3, ymmword ptr [rdi + 8*rax + 96] LONG $0x10c08348 // add rax, 16 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB49_5 LONG $0xc0d4f5c5 // vpaddq ymm0, ymm1, ymm0 LONG $0xc0d4edc5 // vpaddq ymm0, ymm2, ymm0 LONG $0xc0d4e5c5 // vpaddq ymm0, ymm3, ymm0 LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1 LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1 LONG $0xc870f9c5; BYTE $0xee // vpshufd xmm1, xmm0, 238 LONG $0xc1d4f9c5 // vpaddq xmm0, xmm0, xmm1 LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB49_8 LBB49_7: LONG $0xcf040348 // add rax, qword ptr [rdi + 8*rcx] WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB49_7 LBB49_8: WORD $0x8948; BYTE $0x06 // mov qword ptr [rsi], rax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int64_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0x8b48; BYTE $0x07 // mov rax, qword ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB50_7 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB50_3 WORD $0xc931 // xor ecx, ecx JMP LBB50_6 LBB50_3: WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LBB50_4: LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, ymmword ptr [rdi + 8*rax] LONG $0x6c6ffec5; WORD $0x20c7 // vmovdqu 
ymm5, ymmword ptr [rdi + 8*rax + 32] LONG $0x746ffec5; WORD $0x40c7 // vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 64] LONG $0x377de2c4; BYTE $0xfc // vpcmpgtq ymm7, ymm0, ymm4 LONG $0x4b7de3c4; WORD $0x70c4 // vblendvpd ymm0, ymm0, ymm4, ymm7 LONG $0x646ffec5; WORD $0x60c7 // vmovdqu ymm4, ymmword ptr [rdi + 8*rax + 96] LONG $0x3765e2c4; BYTE $0xfd // vpcmpgtq ymm7, ymm3, ymm5 LONG $0x4b65e3c4; WORD $0x70dd // vblendvpd ymm3, ymm3, ymm5, ymm7 LONG $0x376de2c4; BYTE $0xee // vpcmpgtq ymm5, ymm2, ymm6 LONG $0x4b6de3c4; WORD $0x50d6 // vblendvpd ymm2, ymm2, ymm6, ymm5 LONG $0x3775e2c4; BYTE $0xec // vpcmpgtq ymm5, ymm1, ymm4 LONG $0x4b75e3c4; WORD $0x50cc // vblendvpd ymm1, ymm1, ymm4, ymm5 LONG $0x10c08348 // add rax, 16 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB50_4 LONG $0x3765e2c4; BYTE $0xe0 // vpcmpgtq ymm4, ymm3, ymm0 LONG $0x4b65e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm3, ymm0, ymm4 LONG $0x376de2c4; BYTE $0xd8 // vpcmpgtq ymm3, ymm2, ymm0 LONG $0x4b6de3c4; WORD $0x30c0 // vblendvpd ymm0, ymm2, ymm0, ymm3 LONG $0x3775e2c4; BYTE $0xd0 // vpcmpgtq ymm2, ymm1, ymm0 LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 LONG $0x0479e3c4; WORD $0xeec8 // vpermilps xmm1, xmm0, 238 LONG $0x3771e2c4; BYTE $0xd0 // vpcmpgtq xmm2, xmm1, xmm0 LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB50_7 LBB50_6: LONG $0xcf148b48 // mov rdx, qword ptr [rdi + 8*rcx] WORD $0x3948; BYTE $0xc2 // cmp rdx, rax LONG $0xc24c0f48 // cmovl rax, rdx WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB50_6 LBB50_7: WORD $0x8948; BYTE $0x06 // mov qword ptr [rsi], rax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE 
$0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int64_max(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0x8b48; BYTE $0x07 // mov rax, qword ptr [rdi] WORD $0xd285 // test edx, edx JLE LBB51_7 WORD $0x8941; BYTE $0xd0 // mov r8d, edx LONG $0x10f88349 // cmp r8, 16 JAE LBB51_3 WORD $0xc931 // xor ecx, ecx JMP LBB51_6 LBB51_3: WORD $0xe283; BYTE $0x0f // and edx, 15 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xd86ffdc5 // vmovdqa ymm3, ymm0 LONG $0xd06ffdc5 // vmovdqa ymm2, ymm0 LONG $0xc86ffdc5 // vmovdqa ymm1, ymm0 LBB51_4: LONG $0x246ffec5; BYTE $0xc7 // vmovdqu ymm4, ymmword ptr [rdi + 8*rax] LONG $0x6c6ffec5; WORD $0x20c7 // vmovdqu ymm5, ymmword ptr [rdi + 8*rax + 32] LONG $0x746ffec5; WORD $0x40c7 // vmovdqu ymm6, ymmword ptr [rdi + 8*rax + 64] LONG $0x375de2c4; BYTE $0xf8 // vpcmpgtq ymm7, ymm4, ymm0 LONG $0x4b7de3c4; WORD $0x70c4 // vblendvpd ymm0, ymm0, ymm4, ymm7 LONG $0x646ffec5; WORD $0x60c7 // vmovdqu ymm4, ymmword ptr [rdi + 8*rax + 96] LONG $0x3755e2c4; BYTE $0xfb // vpcmpgtq ymm7, ymm5, ymm3 LONG $0x4b65e3c4; WORD $0x70dd // vblendvpd ymm3, ymm3, ymm5, ymm7 LONG $0x374de2c4; BYTE $0xea // vpcmpgtq ymm5, ymm6, ymm2 LONG $0x4b6de3c4; WORD $0x50d6 // vblendvpd ymm2, ymm2, ymm6, ymm5 LONG $0x375de2c4; BYTE $0xe9 // vpcmpgtq ymm5, ymm4, ymm1 LONG $0x4b75e3c4; WORD $0x50cc // vblendvpd ymm1, ymm1, ymm4, ymm5 LONG $0x10c08348 // add rax, 16 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB51_4 LONG $0x377de2c4; BYTE $0xe3 // vpcmpgtq ymm4, ymm0, ymm3 LONG $0x4b65e3c4; WORD $0x40c0 // vblendvpd ymm0, ymm3, ymm0, ymm4 LONG $0x377de2c4; BYTE $0xda // vpcmpgtq ymm3, ymm0, ymm2 LONG $0x4b6de3c4; WORD $0x30c0 // vblendvpd ymm0, ymm2, ymm0, ymm3 LONG $0x377de2c4; 
BYTE $0xd1 // vpcmpgtq ymm2, ymm0, ymm1 LONG $0x4b75e3c4; WORD $0x20c0 // vblendvpd ymm0, ymm1, ymm0, ymm2 LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 LONG $0x3779e2c4; BYTE $0xd1 // vpcmpgtq xmm2, xmm0, xmm1 LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 LONG $0x0479e3c4; WORD $0xeec8 // vpermilps xmm1, xmm0, 238 LONG $0x3779e2c4; BYTE $0xd1 // vpcmpgtq xmm2, xmm0, xmm1 LONG $0x4b71e3c4; WORD $0x20c0 // vblendvpd xmm0, xmm1, xmm0, xmm2 LONG $0x7ef9e1c4; BYTE $0xc0 // vmovq rax, xmm0 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB51_7 LBB51_6: LONG $0xcf148b48 // mov rdx, qword ptr [rdi + 8*rcx] WORD $0x3948; BYTE $0xc2 // cmp rdx, rax LONG $0xc24f0f48 // cmovg rax, rdx WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB51_6 LBB51_7: WORD $0x8948; BYTE $0x06 // mov qword ptr [rsi], rax WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int64_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB52_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB52_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB52_8 LBB52_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB52_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB52_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB52_6: LONG $0x6f7ea1c4; WORD $0xd604 // vmovdqu ymm0, ymmword ptr [rsi + 
8*r10] LONG $0x6f7ea1c4; WORD $0xd64c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rsi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd654; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rsi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd65c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rsi + 8*r10 + 96] LONG $0xd47da1c4; WORD $0xd704 // vpaddq ymm0, ymm0, ymmword ptr [rdi + 8*r10] LONG $0xd475a1c4; WORD $0xd74c; BYTE $0x20 // vpaddq ymm1, ymm1, ymmword ptr [rdi + 8*r10 + 32] LONG $0xd46da1c4; WORD $0xd754; BYTE $0x40 // vpaddq ymm2, ymm2, ymmword ptr [rdi + 8*r10 + 64] LONG $0xd465a1c4; WORD $0xd75c; BYTE $0x60 // vpaddq ymm3, ymm3, ymmword ptr [rdi + 8*r10 + 96] LONG $0x7f7ea1c4; WORD $0xd204 // vmovdqu ymmword ptr [rdx + 8*r10], ymm0 LONG $0x7f7ea1c4; WORD $0xd24c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 8*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0xd254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 8*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0xd25c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 8*r10 + 96], ymm3 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB52_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB52_12 LBB52_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB52_10 LBB52_9: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0xdf04034a // add rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB52_9 LBB52_10: LONG $0x03f98349 // cmp r9, 3 JB LBB52_12 LBB52_11: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0xdf04034a // add rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax LONG $0xde448b4a; BYTE $0x08 // mov rax, qword ptr [rsi + 8*r11 + 8] LONG $0xdf44034a; BYTE $0x08 // add rax, qword ptr [rdi + 8*r11 + 8] LONG $0xda44894a; BYTE $0x08 // mov qword ptr 
[rdx + 8*r11 + 8], rax LONG $0xde448b4a; BYTE $0x10 // mov rax, qword ptr [rsi + 8*r11 + 16] LONG $0xdf44034a; BYTE $0x10 // add rax, qword ptr [rdi + 8*r11 + 16] LONG $0xda44894a; BYTE $0x10 // mov qword ptr [rdx + 8*r11 + 16], rax LONG $0xde448b4a; BYTE $0x18 // mov rax, qword ptr [rsi + 8*r11 + 24] LONG $0xdf44034a; BYTE $0x18 // add rax, qword ptr [rdi + 8*r11 + 24] LONG $0xda44894a; BYTE $0x18 // mov qword ptr [rdx + 8*r11 + 24], rax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB52_11 LBB52_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int64_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB53_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB53_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB53_8 LBB53_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB53_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB53_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB53_6: LONG $0x6f7ea1c4; WORD $0xd704 // vmovdqu ymm0, ymmword ptr [rdi + 8*r10] LONG $0x6f7ea1c4; WORD $0xd74c; BYTE $0x20 // vmovdqu ymm1, ymmword ptr [rdi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd754; BYTE $0x40 // vmovdqu ymm2, ymmword ptr [rdi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd75c; BYTE $0x60 // vmovdqu ymm3, ymmword ptr [rdi + 8*r10 + 96] LONG $0xfb7da1c4; WORD $0xd604 // vpsubq ymm0, ymm0, 
ymmword ptr [rsi + 8*r10] LONG $0xfb75a1c4; WORD $0xd64c; BYTE $0x20 // vpsubq ymm1, ymm1, ymmword ptr [rsi + 8*r10 + 32] LONG $0xfb6da1c4; WORD $0xd654; BYTE $0x40 // vpsubq ymm2, ymm2, ymmword ptr [rsi + 8*r10 + 64] LONG $0xfb65a1c4; WORD $0xd65c; BYTE $0x60 // vpsubq ymm3, ymm3, ymmword ptr [rsi + 8*r10 + 96] LONG $0x7f7ea1c4; WORD $0xd204 // vmovdqu ymmword ptr [rdx + 8*r10], ymm0 LONG $0x7f7ea1c4; WORD $0xd24c; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 8*r10 + 32], ymm1 LONG $0x7f7ea1c4; WORD $0xd254; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 8*r10 + 64], ymm2 LONG $0x7f7ea1c4; WORD $0xd25c; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 8*r10 + 96], ymm3 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB53_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB53_12 LBB53_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB53_10 LBB53_9: LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] LONG $0xde042b4a // sub rax, qword ptr [rsi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB53_9 LBB53_10: LONG $0x03f98349 // cmp r9, 3 JB LBB53_12 LBB53_11: LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] LONG $0xde042b4a // sub rax, qword ptr [rsi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax LONG $0xdf448b4a; BYTE $0x08 // mov rax, qword ptr [rdi + 8*r11 + 8] LONG $0xde442b4a; BYTE $0x08 // sub rax, qword ptr [rsi + 8*r11 + 8] LONG $0xda44894a; BYTE $0x08 // mov qword ptr [rdx + 8*r11 + 8], rax LONG $0xdf448b4a; BYTE $0x10 // mov rax, qword ptr [rdi + 8*r11 + 16] LONG $0xde442b4a; BYTE $0x10 // sub rax, qword ptr [rsi + 8*r11 + 16] LONG $0xda44894a; BYTE $0x10 // mov qword ptr [rdx + 8*r11 + 16], rax LONG $0xdf448b4a; BYTE $0x18 // mov rax, qword ptr [rdi + 8*r11 + 24] LONG $0xde442b4a; BYTE $0x18 // 
sub rax, qword ptr [rsi + 8*r11 + 24] LONG $0xda44894a; BYTE $0x18 // mov qword ptr [rdx + 8*r11 + 24], rax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB53_11 LBB53_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int64_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB54_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB54_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB54_8 LBB54_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB54_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB54_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB54_6: LONG $0x6f7ea1c4; WORD $0xd70c // vmovdqu ymm1, ymmword ptr [rdi + 8*r10] LONG $0x6f7ea1c4; WORD $0xd754; BYTE $0x20 // vmovdqu ymm2, ymmword ptr [rdi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd75c; BYTE $0x40 // vmovdqu ymm3, ymmword ptr [rdi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd744; BYTE $0x60 // vmovdqu ymm0, ymmword ptr [rdi + 8*r10 + 96] LONG $0x6f7ea1c4; WORD $0xd624 // vmovdqu ymm4, ymmword ptr [rsi + 8*r10] LONG $0x6f7ea1c4; WORD $0xd66c; BYTE $0x20 // vmovdqu ymm5, ymmword ptr [rsi + 8*r10 + 32] LONG $0x6f7ea1c4; WORD $0xd674; BYTE $0x40 // vmovdqu ymm6, ymmword ptr [rsi + 8*r10 + 64] LONG $0x6f7ea1c4; WORD $0xd67c; BYTE $0x60 // vmovdqu ymm7, ymmword ptr [rsi + 8*r10 + 96] LONG $0xd473bdc5; BYTE $0x20 // vpsrlq 
ymm8, ymm4, 32 LONG $0xc1f43dc5 // vpmuludq ymm8, ymm8, ymm1 LONG $0xd173b5c5; BYTE $0x20 // vpsrlq ymm9, ymm1, 32 LONG $0xccf435c5 // vpmuludq ymm9, ymm9, ymm4 LONG $0xd43541c4; BYTE $0xc0 // vpaddq ymm8, ymm9, ymm8 LONG $0x733dc1c4; WORD $0x20f0 // vpsllq ymm8, ymm8, 32 LONG $0xc9f4ddc5 // vpmuludq ymm1, ymm4, ymm1 LONG $0xc9d4bdc5 // vpaddq ymm1, ymm8, ymm1 LONG $0xd573ddc5; BYTE $0x20 // vpsrlq ymm4, ymm5, 32 LONG $0xe2f4ddc5 // vpmuludq ymm4, ymm4, ymm2 LONG $0xd273bdc5; BYTE $0x20 // vpsrlq ymm8, ymm2, 32 LONG $0xc5f43dc5 // vpmuludq ymm8, ymm8, ymm5 LONG $0xe4d4bdc5 // vpaddq ymm4, ymm8, ymm4 LONG $0xf473ddc5; BYTE $0x20 // vpsllq ymm4, ymm4, 32 LONG $0xd2f4d5c5 // vpmuludq ymm2, ymm5, ymm2 LONG $0xd4d4edc5 // vpaddq ymm2, ymm2, ymm4 LONG $0xd673ddc5; BYTE $0x20 // vpsrlq ymm4, ymm6, 32 LONG $0xe3f4ddc5 // vpmuludq ymm4, ymm4, ymm3 LONG $0xd373d5c5; BYTE $0x20 // vpsrlq ymm5, ymm3, 32 LONG $0xedf4cdc5 // vpmuludq ymm5, ymm6, ymm5 LONG $0xe4d4d5c5 // vpaddq ymm4, ymm5, ymm4 LONG $0xf473ddc5; BYTE $0x20 // vpsllq ymm4, ymm4, 32 LONG $0xdbf4cdc5 // vpmuludq ymm3, ymm6, ymm3 LONG $0xdcd4e5c5 // vpaddq ymm3, ymm3, ymm4 LONG $0xd773ddc5; BYTE $0x20 // vpsrlq ymm4, ymm7, 32 LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0 LONG $0xd073d5c5; BYTE $0x20 // vpsrlq ymm5, ymm0, 32 LONG $0xedf4c5c5 // vpmuludq ymm5, ymm7, ymm5 LONG $0xe4d4d5c5 // vpaddq ymm4, ymm5, ymm4 LONG $0xf473ddc5; BYTE $0x20 // vpsllq ymm4, ymm4, 32 LONG $0xc0f4c5c5 // vpmuludq ymm0, ymm7, ymm0 LONG $0xc4d4fdc5 // vpaddq ymm0, ymm0, ymm4 LONG $0x7f7ea1c4; WORD $0xd20c // vmovdqu ymmword ptr [rdx + 8*r10], ymm1 LONG $0x7f7ea1c4; WORD $0xd254; BYTE $0x20 // vmovdqu ymmword ptr [rdx + 8*r10 + 32], ymm2 LONG $0x7f7ea1c4; WORD $0xd25c; BYTE $0x40 // vmovdqu ymmword ptr [rdx + 8*r10 + 64], ymm3 LONG $0x7f7ea1c4; WORD $0xd244; BYTE $0x60 // vmovdqu ymmword ptr [rdx + 8*r10 + 96], ymm0 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB54_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 
JE LBB54_12 LBB54_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB54_10 LBB54_9: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0x04af0f4a; BYTE $0xdf // imul rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB54_9 LBB54_10: LONG $0x03f98349 // cmp r9, 3 JB LBB54_12 LBB54_11: LONG $0xde048b4a // mov rax, qword ptr [rsi + 8*r11] LONG $0x04af0f4a; BYTE $0xdf // imul rax, qword ptr [rdi + 8*r11] LONG $0xda04894a // mov qword ptr [rdx + 8*r11], rax LONG $0xde448b4a; BYTE $0x08 // mov rax, qword ptr [rsi + 8*r11 + 8] LONG $0x44af0f4a; WORD $0x08df // imul rax, qword ptr [rdi + 8*r11 + 8] LONG $0xda44894a; BYTE $0x08 // mov qword ptr [rdx + 8*r11 + 8], rax LONG $0xde448b4a; BYTE $0x10 // mov rax, qword ptr [rsi + 8*r11 + 16] LONG $0x44af0f4a; WORD $0x10df // imul rax, qword ptr [rdi + 8*r11 + 16] LONG $0xda44894a; BYTE $0x10 // mov qword ptr [rdx + 8*r11 + 16], rax LONG $0xde448b4a; BYTE $0x18 // mov rax, qword ptr [rsi + 8*r11 + 24] LONG $0x44af0f4a; WORD $0x18df // imul rax, qword ptr [rdi + 8*r11 + 24] LONG $0xda44894a; BYTE $0x18 // mov qword ptr [rdx + 8*r11 + 24], rax LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB54_11 LBB54_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_int64_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp WORD $0x5641 // push r14 BYTE $0x53 // push rbx LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB55_21 WORD $0x8949; BYTE $0xd0 // mov r8, rdx WORD $0x8941; BYTE $0xca // mov r10d, ecx LONG $0x04fa8349 // cmp r10, 4 JAE LBB55_3 
WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB55_8 LBB55_3: WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x20f88348 // cmp rax, 32 JB LBB55_8 WORD $0x894c; BYTE $0xc0 // mov rax, r8 WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x20f88348 // cmp rax, 32 JB LBB55_8 WORD $0x8941; BYTE $0xce // mov r14d, ecx LONG $0x03e68341 // and r14d, 3 WORD $0x894d; BYTE $0xd3 // mov r11, r10 WORD $0x294d; BYTE $0xf3 // sub r11, r14 WORD $0xdb31 // xor ebx, ebx LBB55_6: LONG $0xdf0c8b4c // mov r9, qword ptr [rdi + 8*rbx] LONG $0xdf448b48; BYTE $0x08 // mov rax, qword ptr [rdi + 8*rbx + 8] WORD $0x9948 // cqo LONG $0xde7cf748; BYTE $0x08 // idiv qword ptr [rsi + 8*rbx + 8] LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax WORD $0x894c; BYTE $0xc8 // mov rax, r9 WORD $0x9948 // cqo LONG $0xde3cf748 // idiv qword ptr [rsi + 8*rbx] LONG $0x6ef9e1c4; BYTE $0xc8 // vmovq xmm1, rax LONG $0xc06cf1c5 // vpunpcklqdq xmm0, xmm1, xmm0 LONG $0xdf448b48; BYTE $0x18 // mov rax, qword ptr [rdi + 8*rbx + 24] WORD $0x9948 // cqo LONG $0xde7cf748; BYTE $0x18 // idiv qword ptr [rsi + 8*rbx + 24] WORD $0x8949; BYTE $0xc1 // mov r9, rax LONG $0xdf448b48; BYTE $0x10 // mov rax, qword ptr [rdi + 8*rbx + 16] WORD $0x9948 // cqo LONG $0xde7cf748; BYTE $0x10 // idiv qword ptr [rsi + 8*rbx + 16] LONG $0x6ef9c1c4; BYTE $0xc9 // vmovq xmm1, r9 LONG $0x6ef9e1c4; BYTE $0xd0 // vmovq xmm2, rax LONG $0xc96ce9c5 // vpunpcklqdq xmm1, xmm2, xmm1 LONG $0x7f7ac1c4; WORD $0xd84c; BYTE $0x10 // vmovdqu xmmword ptr [r8 + 8*rbx + 16], xmm1 LONG $0x7f7ac1c4; WORD $0xd804 // vmovdqu xmmword ptr [r8 + 8*rbx], xmm0 LONG $0x04c38348 // add rbx, 4 WORD $0x3949; BYTE $0xdb // cmp r11, rbx JNE LBB55_6 WORD $0x854d; BYTE $0xf6 // test r14, r14 JE LBB55_21 LBB55_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d LONG $0x015b8d49 // lea rbx, [r11 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB55_13 LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] 
LONG $0xde0c8b4a // mov rcx, qword ptr [rsi + 8*r11] WORD $0x8948; BYTE $0xc2 // mov rdx, rax WORD $0x0948; BYTE $0xca // or rdx, rcx LONG $0x20eac148 // shr rdx, 32 JE LBB55_10 WORD $0x9948 // cqo WORD $0xf748; BYTE $0xf9 // idiv rcx JMP LBB55_12 LBB55_10: WORD $0xd231 // xor edx, edx WORD $0xf1f7 // div ecx LBB55_12: LONG $0xd804894b // mov qword ptr [r8 + 8*r11], rax WORD $0x8949; BYTE $0xdb // mov r11, rbx LBB55_13: WORD $0x3949; BYTE $0xda // cmp r10, rbx JNE LBB55_14 JMP LBB55_21 LBB55_19: WORD $0x9948 // cqo WORD $0xf748; BYTE $0xf9 // idiv rcx LONG $0xd844894b; BYTE $0x08 // mov qword ptr [r8 + 8*r11 + 8], rax LONG $0x02c38349 // add r11, 2 WORD $0x394d; BYTE $0xda // cmp r10, r11 JE LBB55_21 LBB55_14: LONG $0xdf048b4a // mov rax, qword ptr [rdi + 8*r11] LONG $0xde0c8b4a // mov rcx, qword ptr [rsi + 8*r11] WORD $0x8948; BYTE $0xc2 // mov rdx, rax WORD $0x0948; BYTE $0xca // or rdx, rcx LONG $0x20eac148 // shr rdx, 32 JE LBB55_15 WORD $0x9948 // cqo WORD $0xf748; BYTE $0xf9 // idiv rcx JMP LBB55_17 LBB55_15: WORD $0xd231 // xor edx, edx WORD $0xf1f7 // div ecx LBB55_17: LONG $0xd804894b // mov qword ptr [r8 + 8*r11], rax LONG $0xdf448b4a; BYTE $0x08 // mov rax, qword ptr [rdi + 8*r11 + 8] LONG $0xde4c8b4a; BYTE $0x08 // mov rcx, qword ptr [rsi + 8*r11 + 8] WORD $0x8948; BYTE $0xc2 // mov rdx, rax WORD $0x0948; BYTE $0xca // or rdx, rcx LONG $0x20eac148 // shr rdx, 32 JNE LBB55_19 WORD $0xd231 // xor edx, edx WORD $0xf1f7 // div ecx LONG $0xd844894b; BYTE $0x08 // mov qword ptr [r8 + 8*r11 + 8], rax LONG $0x02c38349 // add r11, 2 WORD $0x394d; BYTE $0xda // cmp r10, r11 JNE LBB55_14 LBB55_21: LONG $0xf0658d48 // lea rsp, [rbp - 16] BYTE $0x5b // pop rbx WORD $0x5e41 // pop r14 BYTE $0x5d // pop rbp BYTE $0xc3 // ret TEXT ·_float32_sum(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xd285 // test edx, edx JLE LBB56_1 WORD $0x8941; 
BYTE $0xd0 // mov r8d, edx LONG $0x20f88349 // cmp r8, 32 JAE LBB56_4 LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0 WORD $0xc931 // xor ecx, ecx JMP LBB56_7 LBB56_1: LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0 JMP LBB56_8 LBB56_4: WORD $0xe283; BYTE $0x1f // and edx, 31 WORD $0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0xc057f8c5 // vxorps xmm0, xmm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc957f0c5 // vxorps xmm1, xmm1, xmm1 LONG $0xd257e8c5 // vxorps xmm2, xmm2, xmm2 LONG $0xdb57e0c5 // vxorps xmm3, xmm3, xmm3 LBB56_5: LONG $0x0458fcc5; BYTE $0x87 // vaddps ymm0, ymm0, ymmword ptr [rdi + 4*rax] LONG $0x4c58f4c5; WORD $0x2087 // vaddps ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32] LONG $0x5458ecc5; WORD $0x4087 // vaddps ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64] LONG $0x5c58e4c5; WORD $0x6087 // vaddps ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96] LONG $0x20c08348 // add rax, 32 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB56_5 LONG $0xc058f4c5 // vaddps ymm0, ymm1, ymm0 LONG $0xc058ecc5 // vaddps ymm0, ymm2, ymm0 LONG $0xc058e4c5 // vaddps ymm0, ymm3, ymm0 LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 LONG $0xc158f8c5 // vaddps xmm0, xmm0, xmm1 LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1 LONG $0xc158f8c5 // vaddps xmm0, xmm0, xmm1 LONG $0xc816fac5 // vmovshdup xmm1, xmm0 LONG $0xc158fac5 // vaddss xmm0, xmm0, xmm1 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB56_8 LBB56_7: LONG $0x0458fac5; BYTE $0x8f // vaddss xmm0, xmm0, dword ptr [rdi + 4*rcx] WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB56_7 LBB56_8: LONG $0x0611fac5 // vmovss dword ptr [rsi], xmm0 WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_float32_min(SB), $0-32 MOVQ input+0(FP), DI MOVQ result+8(FP), SI MOVQ size+16(FP), DX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 LONG 
$0x0710fac5 // vmovss xmm0, dword ptr [rdi]
    WORD $0xd285 // test edx, edx
    JLE LBB57_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    LONG $0x20f88349 // cmp r8, 32
    JAE LBB57_3
    WORD $0xc931 // xor ecx, ecx
    JMP LBB57_6
LBB57_3:
    WORD $0xe283; BYTE $0x1f // and edx, 31
    WORD $0x894c; BYTE $0xc1 // mov rcx, r8
    WORD $0x2948; BYTE $0xd1 // sub rcx, rdx
    LONG $0x187de2c4; BYTE $0xc0 // vbroadcastss ymm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0xc828fcc5 // vmovaps ymm1, ymm0
    LONG $0xd028fcc5 // vmovaps ymm2, ymm0
    LONG $0xd828fcc5 // vmovaps ymm3, ymm0
LBB57_4:
    LONG $0x045dfcc5; BYTE $0x87 // vminps ymm0, ymm0, ymmword ptr [rdi + 4*rax]
    LONG $0x4c5df4c5; WORD $0x2087 // vminps ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32]
    LONG $0x545decc5; WORD $0x4087 // vminps ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64]
    LONG $0x5c5de4c5; WORD $0x6087 // vminps ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96]
    LONG $0x20c08348 // add rax, 32
    WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
    JNE LBB57_4
    LONG $0xc15dfcc5 // vminps ymm0, ymm0, ymm1
    LONG $0xc25dfcc5 // vminps ymm0, ymm0, ymm2
    LONG $0xc35dfcc5 // vminps ymm0, ymm0, ymm3
    LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
    LONG $0xc15df8c5 // vminps xmm0, xmm0, xmm1
    LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1
    LONG $0xc15df8c5 // vminps xmm0, xmm0, xmm1
    LONG $0xc816fac5 // vmovshdup xmm1, xmm0
    LONG $0xc15dfac5 // vminss xmm0, xmm0, xmm1
    WORD $0x8548; BYTE $0xd2 // test rdx, rdx
    JE LBB57_7
LBB57_6:
    LONG $0x045dfac5; BYTE $0x8f // vminss xmm0, xmm0, dword ptr [rdi + 4*rcx]
    WORD $0xff48; BYTE $0xc1 // inc rcx
    WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
    JNE LBB57_6
LBB57_7:
    LONG $0x0611fac5 // vmovss dword ptr [rsi], xmm0
    WORD $0x8948; BYTE $0xec // mov rsp, rbp
    BYTE $0x5d // pop rbp
    WORD $0xf8c5; BYTE $0x77 // vzeroupper
    BYTE $0xc3 // ret

// _float32_max reduces `size` float32 values starting at `input` with
// vmaxps/vmaxss and stores the single resulting float32 through `result`.
//
// Arguments are read from the argument frame: input+0(FP) -> DI,
// result+8(FP) -> SI, size+16(FP) -> DX.
//
// Behaviour visible in the code below:
//   - input[0] is loaded BEFORE size is tested, so `input` must point at a
//     readable float32 even when size <= 0; in that case *result = input[0].
//   - Only the low 32 bits of size are used (test edx, edx / mov r8d, edx);
//     they are treated as signed for the "<= 0" early exit.
//   - size >= 32 runs a loop consuming 32 floats per iteration in four ymm
//     accumulators that all start as a broadcast of input[0]; the
//     accumulators are then folded down to one scalar.
//   - The remaining size % 32 elements (or all of them when size < 32) are
//     folded one at a time with vmaxss.
//
// NOTE(review): the TEXT directive declares 32 bytes of arguments while only
// offsets 0, 8 and 16 are read here - confirm against the Go prototype.
TEXT ·_float32_max(SB), $0-32
    MOVQ input+0(FP), DI
    MOVQ result+8(FP), SI
    MOVQ size+16(FP), DX
    BYTE $0x55 // push rbp
    WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
    LONG $0xf8e48348 // and rsp, -8
    // Seed the running maximum with input[0]; this load happens before the size check.
    LONG $0x0710fac5 // vmovss xmm0, dword ptr [rdi]
    WORD $0xd285 // test edx, edx
    JLE LBB58_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    LONG $0x20f88349 // cmp r8, 32
    JAE LBB58_3
    // Fewer than 32 elements: scalar loop over everything, starting at index 0.
    WORD $0xc931 // xor ecx, ecx
    JMP LBB58_6
LBB58_3:
    // rdx = size % 32 (scalar remainder), rcx = size - rdx (elements handled by the vector loop).
    WORD $0xe283; BYTE $0x1f // and edx, 31
    WORD $0x894c; BYTE $0xc1 // mov rcx, r8
    WORD $0x2948; BYTE $0xd1 // sub rcx, rdx
    LONG $0x187de2c4; BYTE $0xc0 // vbroadcastss ymm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0xc828fcc5 // vmovaps ymm1, ymm0
    LONG $0xd028fcc5 // vmovaps ymm2, ymm0
    LONG $0xd828fcc5 // vmovaps ymm3, ymm0
LBB58_4:
    // Main loop: 4 x 8 floats per iteration, rax is the element index.
    LONG $0x045ffcc5; BYTE $0x87 // vmaxps ymm0, ymm0, ymmword ptr [rdi + 4*rax]
    LONG $0x4c5ff4c5; WORD $0x2087 // vmaxps ymm1, ymm1, ymmword ptr [rdi + 4*rax + 32]
    LONG $0x545fecc5; WORD $0x4087 // vmaxps ymm2, ymm2, ymmword ptr [rdi + 4*rax + 64]
    LONG $0x5c5fe4c5; WORD $0x6087 // vmaxps ymm3, ymm3, ymmword ptr [rdi + 4*rax + 96]
    LONG $0x20c08348 // add rax, 32
    WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
    JNE LBB58_4
    // Fold the four accumulators, then 256 -> 128 -> 64 -> 32 bits.
    LONG $0xc15ffcc5 // vmaxps ymm0, ymm0, ymm1
    LONG $0xc25ffcc5 // vmaxps ymm0, ymm0, ymm2
    LONG $0xc35ffcc5 // vmaxps ymm0, ymm0, ymm3
    LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
    LONG $0xc15ff8c5 // vmaxps xmm0, xmm0, xmm1
    LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1
    LONG $0xc15ff8c5 // vmaxps xmm0, xmm0, xmm1
    LONG $0xc816fac5 // vmovshdup xmm1, xmm0
    LONG $0xc15ffac5 // vmaxss xmm0, xmm0, xmm1
    WORD $0x8548; BYTE $0xd2 // test rdx, rdx
    JE LBB58_7
LBB58_6:
    // Scalar tail: rcx runs from the first unprocessed index up to size.
    LONG $0x045ffac5; BYTE $0x8f // vmaxss xmm0, xmm0, dword ptr [rdi + 4*rcx]
    WORD $0xff48; BYTE $0xc1 // inc rcx
    WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
    JNE LBB58_6
LBB58_7:
    // Store the result and undo the prologue.
    LONG $0x0611fac5 // vmovss dword ptr [rsi], xmm0
    WORD $0x8948; BYTE $0xec // mov rsp, rbp
    BYTE $0x5d // pop rbp
    WORD $0xf8c5; BYTE $0x77 // vzeroupper
    BYTE $0xc3 // ret

TEXT ·_float32_add(SB), $0-32
    MOVQ input1+0(FP), DI
    MOVQ input2+8(FP), SI
    MOVQ output+16(FP), DX
    MOVQ size+24(FP), CX
    BYTE $0x55 // push rbp
    WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
    LONG $0xf8e48348 // and rsp, -8
    WORD
$0xc985 // test ecx, ecx JLE LBB59_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB59_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB59_8 LBB59_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB59_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB59_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB59_6: LONG $0x107ca1c4; WORD $0x9604 // vmovups ymm0, ymmword ptr [rsi + 4*r10] LONG $0x107ca1c4; WORD $0x964c; BYTE $0x20 // vmovups ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0x107ca1c4; WORD $0x9654; BYTE $0x40 // vmovups ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0x107ca1c4; WORD $0x965c; BYTE $0x60 // vmovups ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0x587ca1c4; WORD $0x9704 // vaddps ymm0, ymm0, ymmword ptr [rdi + 4*r10] LONG $0x5874a1c4; WORD $0x974c; BYTE $0x20 // vaddps ymm1, ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0x586ca1c4; WORD $0x9754; BYTE $0x40 // vaddps ymm2, ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0x5864a1c4; WORD $0x975c; BYTE $0x60 // vaddps ymm3, ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0x117ca1c4; WORD $0x9204 // vmovups ymmword ptr [rdx + 4*r10], ymm0 LONG $0x117ca1c4; WORD $0x924c; BYTE $0x20 // vmovups ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x117ca1c4; WORD $0x9254; BYTE $0x40 // vmovups ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x117ca1c4; WORD $0x925c; BYTE $0x60 // vmovups ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB59_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB59_12 LBB59_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // 
mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB59_10 LBB59_9: LONG $0x107aa1c4; WORD $0x9e04 // vmovss xmm0, dword ptr [rsi + 4*r11] LONG $0x587aa1c4; WORD $0x9f04 // vaddss xmm0, xmm0, dword ptr [rdi + 4*r11] LONG $0x117aa1c4; WORD $0x9a04 // vmovss dword ptr [rdx + 4*r11], xmm0 WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB59_9 LBB59_10: LONG $0x03f98349 // cmp r9, 3 JB LBB59_12 LBB59_11: LONG $0x107aa1c4; WORD $0x9e04 // vmovss xmm0, dword ptr [rsi + 4*r11] LONG $0x587aa1c4; WORD $0x9f04 // vaddss xmm0, xmm0, dword ptr [rdi + 4*r11] LONG $0x117aa1c4; WORD $0x9a04 // vmovss dword ptr [rdx + 4*r11], xmm0 LONG $0x107aa1c4; WORD $0x9e44; BYTE $0x04 // vmovss xmm0, dword ptr [rsi + 4*r11 + 4] LONG $0x587aa1c4; WORD $0x9f44; BYTE $0x04 // vaddss xmm0, xmm0, dword ptr [rdi + 4*r11 + 4] LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x04 // vmovss dword ptr [rdx + 4*r11 + 4], xmm0 LONG $0x107aa1c4; WORD $0x9e44; BYTE $0x08 // vmovss xmm0, dword ptr [rsi + 4*r11 + 8] LONG $0x587aa1c4; WORD $0x9f44; BYTE $0x08 // vaddss xmm0, xmm0, dword ptr [rdi + 4*r11 + 8] LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x08 // vmovss dword ptr [rdx + 4*r11 + 8], xmm0 LONG $0x107aa1c4; WORD $0x9e44; BYTE $0x0c // vmovss xmm0, dword ptr [rsi + 4*r11 + 12] LONG $0x587aa1c4; WORD $0x9f44; BYTE $0x0c // vaddss xmm0, xmm0, dword ptr [rdi + 4*r11 + 12] LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x0c // vmovss dword ptr [rdx + 4*r11 + 12], xmm0 LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB59_11 LBB59_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_float32_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB60_12 
WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB60_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB60_8 LBB60_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB60_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB60_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB60_6: LONG $0x107ca1c4; WORD $0x9704 // vmovups ymm0, ymmword ptr [rdi + 4*r10] LONG $0x107ca1c4; WORD $0x974c; BYTE $0x20 // vmovups ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0x107ca1c4; WORD $0x9754; BYTE $0x40 // vmovups ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0x107ca1c4; WORD $0x975c; BYTE $0x60 // vmovups ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0x5c7ca1c4; WORD $0x9604 // vsubps ymm0, ymm0, ymmword ptr [rsi + 4*r10] LONG $0x5c74a1c4; WORD $0x964c; BYTE $0x20 // vsubps ymm1, ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0x5c6ca1c4; WORD $0x9654; BYTE $0x40 // vsubps ymm2, ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0x5c64a1c4; WORD $0x965c; BYTE $0x60 // vsubps ymm3, ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0x117ca1c4; WORD $0x9204 // vmovups ymmword ptr [rdx + 4*r10], ymm0 LONG $0x117ca1c4; WORD $0x924c; BYTE $0x20 // vmovups ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x117ca1c4; WORD $0x9254; BYTE $0x40 // vmovups ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x117ca1c4; WORD $0x925c; BYTE $0x60 // vmovups ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB60_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB60_12 LBB60_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 
// not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB60_10 LBB60_9: LONG $0x107aa1c4; WORD $0x9f04 // vmovss xmm0, dword ptr [rdi + 4*r11] LONG $0x5c7aa1c4; WORD $0x9e04 // vsubss xmm0, xmm0, dword ptr [rsi + 4*r11] LONG $0x117aa1c4; WORD $0x9a04 // vmovss dword ptr [rdx + 4*r11], xmm0 WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB60_9 LBB60_10: LONG $0x03f98349 // cmp r9, 3 JB LBB60_12 LBB60_11: LONG $0x107aa1c4; WORD $0x9f04 // vmovss xmm0, dword ptr [rdi + 4*r11] LONG $0x5c7aa1c4; WORD $0x9e04 // vsubss xmm0, xmm0, dword ptr [rsi + 4*r11] LONG $0x117aa1c4; WORD $0x9a04 // vmovss dword ptr [rdx + 4*r11], xmm0 LONG $0x107aa1c4; WORD $0x9f44; BYTE $0x04 // vmovss xmm0, dword ptr [rdi + 4*r11 + 4] LONG $0x5c7aa1c4; WORD $0x9e44; BYTE $0x04 // vsubss xmm0, xmm0, dword ptr [rsi + 4*r11 + 4] LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x04 // vmovss dword ptr [rdx + 4*r11 + 4], xmm0 LONG $0x107aa1c4; WORD $0x9f44; BYTE $0x08 // vmovss xmm0, dword ptr [rdi + 4*r11 + 8] LONG $0x5c7aa1c4; WORD $0x9e44; BYTE $0x08 // vsubss xmm0, xmm0, dword ptr [rsi + 4*r11 + 8] LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x08 // vmovss dword ptr [rdx + 4*r11 + 8], xmm0 LONG $0x107aa1c4; WORD $0x9f44; BYTE $0x0c // vmovss xmm0, dword ptr [rdi + 4*r11 + 12] LONG $0x5c7aa1c4; WORD $0x9e44; BYTE $0x0c // vsubss xmm0, xmm0, dword ptr [rsi + 4*r11 + 12] LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x0c // vmovss dword ptr [rdx + 4*r11 + 12], xmm0 LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB60_11 LBB60_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_float32_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB61_12 WORD $0x8941; BYTE $0xc8 // mov r8d, 
ecx LONG $0x20f88349 // cmp r8, 32 JAE LBB61_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB61_8 LBB61_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB61_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB61_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x1fe18341 // and r9d, 31 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB61_6: LONG $0x107ca1c4; WORD $0x9604 // vmovups ymm0, ymmword ptr [rsi + 4*r10] LONG $0x107ca1c4; WORD $0x964c; BYTE $0x20 // vmovups ymm1, ymmword ptr [rsi + 4*r10 + 32] LONG $0x107ca1c4; WORD $0x9654; BYTE $0x40 // vmovups ymm2, ymmword ptr [rsi + 4*r10 + 64] LONG $0x107ca1c4; WORD $0x965c; BYTE $0x60 // vmovups ymm3, ymmword ptr [rsi + 4*r10 + 96] LONG $0x597ca1c4; WORD $0x9704 // vmulps ymm0, ymm0, ymmword ptr [rdi + 4*r10] LONG $0x5974a1c4; WORD $0x974c; BYTE $0x20 // vmulps ymm1, ymm1, ymmword ptr [rdi + 4*r10 + 32] LONG $0x596ca1c4; WORD $0x9754; BYTE $0x40 // vmulps ymm2, ymm2, ymmword ptr [rdi + 4*r10 + 64] LONG $0x5964a1c4; WORD $0x975c; BYTE $0x60 // vmulps ymm3, ymm3, ymmword ptr [rdi + 4*r10 + 96] LONG $0x117ca1c4; WORD $0x9204 // vmovups ymmword ptr [rdx + 4*r10], ymm0 LONG $0x117ca1c4; WORD $0x924c; BYTE $0x20 // vmovups ymmword ptr [rdx + 4*r10 + 32], ymm1 LONG $0x117ca1c4; WORD $0x9254; BYTE $0x40 // vmovups ymmword ptr [rdx + 4*r10 + 64], ymm2 LONG $0x117ca1c4; WORD $0x925c; BYTE $0x60 // vmovups ymmword ptr [rdx + 4*r10 + 96], ymm3 LONG $0x20c28349 // add r10, 32 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB61_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB61_12 LBB61_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // 
add r9, r8
    LONG $0x03e18348 // and rcx, 3
    JE LBB61_10
LBB61_9:
    LONG $0x107aa1c4; WORD $0x9e04 // vmovss xmm0, dword ptr [rsi + 4*r11]
    LONG $0x597aa1c4; WORD $0x9f04 // vmulss xmm0, xmm0, dword ptr [rdi + 4*r11]
    LONG $0x117aa1c4; WORD $0x9a04 // vmovss dword ptr [rdx + 4*r11], xmm0
    WORD $0xff49; BYTE $0xc3 // inc r11
    WORD $0xff48; BYTE $0xc9 // dec rcx
    JNE LBB61_9
LBB61_10:
    LONG $0x03f98349 // cmp r9, 3
    JB LBB61_12
LBB61_11:
    LONG $0x107aa1c4; WORD $0x9e04 // vmovss xmm0, dword ptr [rsi + 4*r11]
    LONG $0x597aa1c4; WORD $0x9f04 // vmulss xmm0, xmm0, dword ptr [rdi + 4*r11]
    LONG $0x117aa1c4; WORD $0x9a04 // vmovss dword ptr [rdx + 4*r11], xmm0
    LONG $0x107aa1c4; WORD $0x9e44; BYTE $0x04 // vmovss xmm0, dword ptr [rsi + 4*r11 + 4]
    LONG $0x597aa1c4; WORD $0x9f44; BYTE $0x04 // vmulss xmm0, xmm0, dword ptr [rdi + 4*r11 + 4]
    LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x04 // vmovss dword ptr [rdx + 4*r11 + 4], xmm0
    LONG $0x107aa1c4; WORD $0x9e44; BYTE $0x08 // vmovss xmm0, dword ptr [rsi + 4*r11 + 8]
    LONG $0x597aa1c4; WORD $0x9f44; BYTE $0x08 // vmulss xmm0, xmm0, dword ptr [rdi + 4*r11 + 8]
    LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x08 // vmovss dword ptr [rdx + 4*r11 + 8], xmm0
    LONG $0x107aa1c4; WORD $0x9e44; BYTE $0x0c // vmovss xmm0, dword ptr [rsi + 4*r11 + 12]
    LONG $0x597aa1c4; WORD $0x9f44; BYTE $0x0c // vmulss xmm0, xmm0, dword ptr [rdi + 4*r11 + 12]
    LONG $0x117aa1c4; WORD $0x9a44; BYTE $0x0c // vmovss dword ptr [rdx + 4*r11 + 12], xmm0
    LONG $0x04c38349 // add r11, 4
    WORD $0x394d; BYTE $0xd8 // cmp r8, r11
    JNE LBB61_11
LBB61_12:
    WORD $0x8948; BYTE $0xec // mov rsp, rbp
    BYTE $0x5d // pop rbp
    WORD $0xf8c5; BYTE $0x77 // vzeroupper
    BYTE $0xc3 // ret

// _float32_div writes output[i] = input1[i] / input2[i] for i in [0, size).
//
// Arguments are read from the argument frame: input1+0(FP) -> DI,
// input2+8(FP) -> SI, output+16(FP) -> DX, size+24(FP) -> CX.
// Only the low 32 bits of size are used; a value <= 0 (as a signed 32-bit
// integer) returns without reading or writing any of the arrays.
//
// The AVX loop (32 floats per iteration) is taken only when size >= 32 and
// the unsigned pointer differences output-input1 and output-input2 are both
// >= 128 bytes; otherwise, and for the size % 32 remainder, elements are
// divided one at a time with vdivss.
//
// The vector loop divides with vdivps on purpose. A reciprocal estimate
// (vrcpps) refined by one Newton-Raphson step is not correctly rounded and
// produces NaN for x/0 and x/Inf, so elements handled by the vector loop
// would disagree with elements handled by the vdivss tail.
// NOTE(review): this file looks generated from C; if so, apply the matching
// compiler setting (no reciprocal estimates) when regenerating.
TEXT ·_float32_div(SB), $0-32
    MOVQ input1+0(FP), DI
    MOVQ input2+8(FP), SI
    MOVQ output+16(FP), DX
    MOVQ size+24(FP), CX
    BYTE $0x55 // push rbp
    WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
    LONG $0xf8e48348 // and rsp, -8
    WORD $0xc985 // test ecx, ecx
    JLE LBB62_12
    WORD $0x8941; BYTE $0xc8 // mov r8d, ecx
    LONG $0x20f88349 // cmp r8, 32
    JAE LBB62_3
    // Fewer than 32 elements: scalar path from index 0.
    WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
    JMP LBB62_8
LBB62_3:
    // Fall back to the scalar path when output starts within 128 bytes above
    // either input (unsigned difference), i.e. when a 128-byte vector store
    // could overwrite input that has not been read yet.
    WORD $0x8948; BYTE $0xd0 // mov rax, rdx
    WORD $0x2948; BYTE $0xf8 // sub rax, rdi
    WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
    LONG $0x00803d48; WORD $0x0000 // cmp rax, 128
    JB LBB62_8
    WORD $0x8948; BYTE $0xd0 // mov rax, rdx
    WORD $0x2948; BYTE $0xf0 // sub rax, rsi
    LONG $0x00803d48; WORD $0x0000 // cmp rax, 128
    JB LBB62_8
    // r9 = size % 32 (scalar remainder), r10 = size - r9 (vector element count).
    WORD $0x8941; BYTE $0xc9 // mov r9d, ecx
    LONG $0x1fe18341 // and r9d, 31
    WORD $0x894d; BYTE $0xc2 // mov r10, r8
    WORD $0x294d; BYTE $0xca // sub r10, r9
    WORD $0xc031 // xor eax, eax
LBB62_6:
    // Load 32 dividends, divide by 32 divisors straight from memory, store.
    LONG $0x0410fcc5; BYTE $0x87 // vmovups ymm0, ymmword ptr [rdi + 4*rax]
    LONG $0x4c10fcc5; WORD $0x2087 // vmovups ymm1, ymmword ptr [rdi + 4*rax + 32]
    LONG $0x5410fcc5; WORD $0x4087 // vmovups ymm2, ymmword ptr [rdi + 4*rax + 64]
    LONG $0x5c10fcc5; WORD $0x6087 // vmovups ymm3, ymmword ptr [rdi + 4*rax + 96]
    LONG $0x045efcc5; BYTE $0x86 // vdivps ymm0, ymm0, ymmword ptr [rsi + 4*rax]
    LONG $0x4c5ef4c5; WORD $0x2086 // vdivps ymm1, ymm1, ymmword ptr [rsi + 4*rax + 32]
    LONG $0x545eecc5; WORD $0x4086 // vdivps ymm2, ymm2, ymmword ptr [rsi + 4*rax + 64]
    LONG $0x5c5ee4c5; WORD $0x6086 // vdivps ymm3, ymm3, ymmword ptr [rsi + 4*rax + 96]
    LONG $0x0411fcc5; BYTE $0x82 // vmovups ymmword ptr [rdx + 4*rax], ymm0
    LONG $0x4c11fcc5; WORD $0x2082 // vmovups ymmword ptr [rdx + 4*rax + 32], ymm1
    LONG $0x5411fcc5; WORD $0x4082 // vmovups ymmword ptr [rdx + 4*rax + 64], ymm2
    LONG $0x5c11fcc5; WORD $0x6082 // vmovups ymmword ptr [rdx + 4*rax + 96], ymm3
    LONG $0x20c08348 // add rax, 32
    WORD $0x3949; BYTE $0xc2 // cmp r10, rax
    JNE LBB62_6
    WORD $0x854d; BYTE $0xc9 // test r9, r9
    JE LBB62_12
LBB62_8:
    // Scalar tail. If the number of remaining elements is odd, peel one
    // element first so the loop below can handle two per iteration.
    WORD $0x2944; BYTE $0xd1 // sub ecx, r10d
    LONG $0x01428d49 // lea rax, [r10 + 1]
    WORD $0xc1f6; BYTE $0x01 // test cl, 1
    JE LBB62_10
    LONG $0x107aa1c4; WORD $0x9704 // vmovss xmm0, dword ptr [rdi + 4*r10]
    LONG $0x5e7aa1c4; WORD $0x9604 // vdivss xmm0, xmm0, dword ptr [rsi + 4*r10]
    LONG $0x117aa1c4; WORD $0x9204 // vmovss dword ptr [rdx + 4*r10], xmm0
    WORD $0x8949; BYTE $0xc2 // mov r10, rax
LBB62_10:
    WORD $0x3949; BYTE $0xc0 // cmp r8, rax
    JE LBB62_12
LBB62_11:
    LONG $0x107aa1c4; WORD $0x9704 // vmovss xmm0, dword ptr [rdi + 4*r10]
    LONG $0x5e7aa1c4; WORD $0x9604 // vdivss xmm0, xmm0, dword ptr [rsi + 4*r10]
    LONG $0x117aa1c4; WORD $0x9204 // vmovss dword ptr [rdx + 4*r10], xmm0
    LONG $0x107aa1c4; WORD $0x9744; BYTE $0x04 // vmovss xmm0, dword ptr [rdi + 4*r10 + 4]
    LONG $0x5e7aa1c4; WORD $0x9644; BYTE $0x04 // vdivss xmm0, xmm0, dword ptr [rsi + 4*r10 + 4]
    LONG $0x117aa1c4; WORD $0x9244; BYTE $0x04 // vmovss dword ptr [rdx + 4*r10 + 4], xmm0
    LONG $0x02c28349 // add r10, 2
    WORD $0x394d; BYTE $0xd0 // cmp r8, r10
    JNE LBB62_11
LBB62_12:
    WORD $0x8948; BYTE $0xec // mov rsp, rbp
    BYTE $0x5d // pop rbp
    WORD $0xf8c5; BYTE $0x77 // vzeroupper
    BYTE $0xc3 // ret

TEXT ·_float64_sum(SB), $0-32
    MOVQ input+0(FP), DI
    MOVQ result+8(FP), SI
    MOVQ size+16(FP), DX
    BYTE $0x55 // push rbp
    WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
    LONG $0xf8e48348 // and rsp, -8
    WORD $0xd285 // test edx, edx
    JLE LBB63_1
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    LONG $0x10f88349 // cmp r8, 16
    JAE LBB63_4
    LONG $0xc057f9c5 //
vxorpd xmm0, xmm0, xmm0
    WORD $0xc931 // xor ecx, ecx
    JMP LBB63_7
LBB63_1:
    LONG $0xc057f9c5 // vxorpd xmm0, xmm0, xmm0
    JMP LBB63_8
LBB63_4:
    WORD $0xe283; BYTE $0x0f // and edx, 15
    WORD $0x894c; BYTE $0xc1 // mov rcx, r8
    WORD $0x2948; BYTE $0xd1 // sub rcx, rdx
    LONG $0xc057f9c5 // vxorpd xmm0, xmm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0xc957f1c5 // vxorpd xmm1, xmm1, xmm1
    LONG $0xd257e9c5 // vxorpd xmm2, xmm2, xmm2
    LONG $0xdb57e1c5 // vxorpd xmm3, xmm3, xmm3
LBB63_5:
    LONG $0x0458fdc5; BYTE $0xc7 // vaddpd ymm0, ymm0, ymmword ptr [rdi + 8*rax]
    LONG $0x4c58f5c5; WORD $0x20c7 // vaddpd ymm1, ymm1, ymmword ptr [rdi + 8*rax + 32]
    LONG $0x5458edc5; WORD $0x40c7 // vaddpd ymm2, ymm2, ymmword ptr [rdi + 8*rax + 64]
    LONG $0x5c58e5c5; WORD $0x60c7 // vaddpd ymm3, ymm3, ymmword ptr [rdi + 8*rax + 96]
    LONG $0x10c08348 // add rax, 16
    WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
    JNE LBB63_5
    LONG $0xc058f5c5 // vaddpd ymm0, ymm1, ymm0
    LONG $0xc058edc5 // vaddpd ymm0, ymm2, ymm0
    LONG $0xc058e5c5 // vaddpd ymm0, ymm3, ymm0
    LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
    LONG $0xc158f9c5 // vaddpd xmm0, xmm0, xmm1
    LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1
    LONG $0xc158fbc5 // vaddsd xmm0, xmm0, xmm1
    WORD $0x8548; BYTE $0xd2 // test rdx, rdx
    JE LBB63_8
LBB63_7:
    LONG $0x0458fbc5; BYTE $0xcf // vaddsd xmm0, xmm0, qword ptr [rdi + 8*rcx]
    WORD $0xff48; BYTE $0xc1 // inc rcx
    WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
    JNE LBB63_7
LBB63_8:
    LONG $0x0611fbc5 // vmovsd qword ptr [rsi], xmm0
    WORD $0x8948; BYTE $0xec // mov rsp, rbp
    BYTE $0x5d // pop rbp
    WORD $0xf8c5; BYTE $0x77 // vzeroupper
    BYTE $0xc3 // ret

// _float64_min reduces `size` float64 values starting at `input` with
// vminpd/vminsd and stores the single resulting float64 through `result`.
//
// Arguments are read from the argument frame: input+0(FP) -> DI,
// result+8(FP) -> SI, size+16(FP) -> DX.
//
// Behaviour visible in the code below:
//   - input[0] is loaded BEFORE size is tested, so `input` must point at a
//     readable float64 even when size <= 0; in that case *result = input[0].
//   - Only the low 32 bits of size are used (test edx, edx / mov r8d, edx);
//     they are treated as signed for the "<= 0" early exit.
//   - size >= 16 runs a loop consuming 16 doubles per iteration in four ymm
//     accumulators that all start as a broadcast of input[0]; the
//     accumulators are then folded down to one scalar.
//   - The remaining size % 16 elements (or all of them when size < 16) are
//     folded one at a time with vminsd.
//
// NOTE(review): the TEXT directive declares 32 bytes of arguments while only
// offsets 0, 8 and 16 are read here - confirm against the Go prototype.
TEXT ·_float64_min(SB), $0-32
    MOVQ input+0(FP), DI
    MOVQ result+8(FP), SI
    MOVQ size+16(FP), DX
    BYTE $0x55 // push rbp
    WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
    LONG $0xf8e48348 // and rsp, -8
    // Seed the running minimum with input[0]; this load happens before the size check.
    LONG $0x0710fbc5 // vmovsd xmm0, qword ptr [rdi]
    WORD $0xd285 // test edx, edx
    JLE LBB64_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    LONG $0x10f88349 // cmp r8, 16
    JAE LBB64_3
    // Fewer than 16 elements: scalar loop over everything, starting at index 0.
    WORD $0xc931 // xor ecx, ecx
    JMP LBB64_6
LBB64_3:
    // rdx = size % 16 (scalar remainder), rcx = size - rdx (elements handled by the vector loop).
    WORD $0xe283; BYTE $0x0f // and edx, 15
    WORD $0x894c; BYTE $0xc1 // mov rcx, r8
    WORD $0x2948; BYTE $0xd1 // sub rcx, rdx
    LONG $0x197de2c4; BYTE $0xc0 // vbroadcastsd ymm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0xc828fdc5 // vmovapd ymm1, ymm0
    LONG $0xd028fdc5 // vmovapd ymm2, ymm0
    LONG $0xd828fdc5 // vmovapd ymm3, ymm0
LBB64_4:
    // Main loop: 4 x 4 doubles per iteration, rax is the element index.
    LONG $0x045dfdc5; BYTE $0xc7 // vminpd ymm0, ymm0, ymmword ptr [rdi + 8*rax]
    LONG $0x4c5df5c5; WORD $0x20c7 // vminpd ymm1, ymm1, ymmword ptr [rdi + 8*rax + 32]
    LONG $0x545dedc5; WORD $0x40c7 // vminpd ymm2, ymm2, ymmword ptr [rdi + 8*rax + 64]
    LONG $0x5c5de5c5; WORD $0x60c7 // vminpd ymm3, ymm3, ymmword ptr [rdi + 8*rax + 96]
    LONG $0x10c08348 // add rax, 16
    WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
    JNE LBB64_4
    // Fold the four accumulators, then 256 -> 128 -> 64 bits.
    LONG $0xc15dfdc5 // vminpd ymm0, ymm0, ymm1
    LONG $0xc25dfdc5 // vminpd ymm0, ymm0, ymm2
    LONG $0xc35dfdc5 // vminpd ymm0, ymm0, ymm3
    LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1
    LONG $0xc15df9c5 // vminpd xmm0, xmm0, xmm1
    LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1
    LONG $0xc15dfbc5 // vminsd xmm0, xmm0, xmm1
    WORD $0x8548; BYTE $0xd2 // test rdx, rdx
    JE LBB64_7
LBB64_6:
    // Scalar tail: rcx runs from the first unprocessed index up to size.
    LONG $0x045dfbc5; BYTE $0xcf // vminsd xmm0, xmm0, qword ptr [rdi + 8*rcx]
    WORD $0xff48; BYTE $0xc1 // inc rcx
    WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
    JNE LBB64_6
LBB64_7:
    // Store the result and undo the prologue.
    LONG $0x0611fbc5 // vmovsd qword ptr [rsi], xmm0
    WORD $0x8948; BYTE $0xec // mov rsp, rbp
    BYTE $0x5d // pop rbp
    WORD $0xf8c5; BYTE $0x77 // vzeroupper
    BYTE $0xc3 // ret

TEXT ·_float64_max(SB), $0-32
    MOVQ input+0(FP), DI
    MOVQ result+8(FP), SI
    MOVQ size+16(FP), DX
    BYTE $0x55 // push rbp
    WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
    LONG $0xf8e48348 // and rsp, -8
    LONG $0x0710fbc5 // vmovsd xmm0, qword ptr [rdi]
    WORD $0xd285 // test edx, edx
    JLE LBB65_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    LONG $0x10f88349 // cmp r8, 16
    JAE LBB65_3
    WORD $0xc931 // xor ecx, ecx
    JMP LBB65_6
LBB65_3:
    WORD $0xe283; BYTE $0x0f // and edx, 15
    WORD
$0x894c; BYTE $0xc1 // mov rcx, r8 WORD $0x2948; BYTE $0xd1 // sub rcx, rdx LONG $0x197de2c4; BYTE $0xc0 // vbroadcastsd ymm0, xmm0 WORD $0xc031 // xor eax, eax LONG $0xc828fdc5 // vmovapd ymm1, ymm0 LONG $0xd028fdc5 // vmovapd ymm2, ymm0 LONG $0xd828fdc5 // vmovapd ymm3, ymm0 LBB65_4: LONG $0x045ffdc5; BYTE $0xc7 // vmaxpd ymm0, ymm0, ymmword ptr [rdi + 8*rax] LONG $0x4c5ff5c5; WORD $0x20c7 // vmaxpd ymm1, ymm1, ymmword ptr [rdi + 8*rax + 32] LONG $0x545fedc5; WORD $0x40c7 // vmaxpd ymm2, ymm2, ymmword ptr [rdi + 8*rax + 64] LONG $0x5c5fe5c5; WORD $0x60c7 // vmaxpd ymm3, ymm3, ymmword ptr [rdi + 8*rax + 96] LONG $0x10c08348 // add rax, 16 WORD $0x3948; BYTE $0xc1 // cmp rcx, rax JNE LBB65_4 LONG $0xc15ffdc5 // vmaxpd ymm0, ymm0, ymm1 LONG $0xc25ffdc5 // vmaxpd ymm0, ymm0, ymm2 LONG $0xc35ffdc5 // vmaxpd ymm0, ymm0, ymm3 LONG $0x197de3c4; WORD $0x01c1 // vextractf128 xmm1, ymm0, 1 LONG $0xc15ff9c5 // vmaxpd xmm0, xmm0, xmm1 LONG $0x0579e3c4; WORD $0x01c8 // vpermilpd xmm1, xmm0, 1 LONG $0xc15ffbc5 // vmaxsd xmm0, xmm0, xmm1 WORD $0x8548; BYTE $0xd2 // test rdx, rdx JE LBB65_7 LBB65_6: LONG $0x045ffbc5; BYTE $0xcf // vmaxsd xmm0, xmm0, qword ptr [rdi + 8*rcx] WORD $0xff48; BYTE $0xc1 // inc rcx WORD $0x3949; BYTE $0xc8 // cmp r8, rcx JNE LBB65_6 LBB65_7: LONG $0x0611fbc5 // vmovsd qword ptr [rsi], xmm0 WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_float64_add(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB66_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB66_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB66_8 LBB66_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD 
$0x0000; BYTE $0x00 // cmp r9, 128 JB LBB66_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB66_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB66_6: LONG $0x107da1c4; WORD $0xd604 // vmovupd ymm0, ymmword ptr [rsi + 8*r10] LONG $0x107da1c4; WORD $0xd64c; BYTE $0x20 // vmovupd ymm1, ymmword ptr [rsi + 8*r10 + 32] LONG $0x107da1c4; WORD $0xd654; BYTE $0x40 // vmovupd ymm2, ymmword ptr [rsi + 8*r10 + 64] LONG $0x107da1c4; WORD $0xd65c; BYTE $0x60 // vmovupd ymm3, ymmword ptr [rsi + 8*r10 + 96] LONG $0x587da1c4; WORD $0xd704 // vaddpd ymm0, ymm0, ymmword ptr [rdi + 8*r10] LONG $0x5875a1c4; WORD $0xd74c; BYTE $0x20 // vaddpd ymm1, ymm1, ymmword ptr [rdi + 8*r10 + 32] LONG $0x586da1c4; WORD $0xd754; BYTE $0x40 // vaddpd ymm2, ymm2, ymmword ptr [rdi + 8*r10 + 64] LONG $0x5865a1c4; WORD $0xd75c; BYTE $0x60 // vaddpd ymm3, ymm3, ymmword ptr [rdi + 8*r10 + 96] LONG $0x117da1c4; WORD $0xd204 // vmovupd ymmword ptr [rdx + 8*r10], ymm0 LONG $0x117da1c4; WORD $0xd24c; BYTE $0x20 // vmovupd ymmword ptr [rdx + 8*r10 + 32], ymm1 LONG $0x117da1c4; WORD $0xd254; BYTE $0x40 // vmovupd ymmword ptr [rdx + 8*r10 + 64], ymm2 LONG $0x117da1c4; WORD $0xd25c; BYTE $0x60 // vmovupd ymmword ptr [rdx + 8*r10 + 96], ymm3 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB66_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB66_12 LBB66_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB66_10 LBB66_9: LONG $0x107ba1c4; WORD $0xde04 // vmovsd xmm0, qword ptr [rsi + 8*r11] LONG $0x587ba1c4; WORD $0xdf04 // vaddsd xmm0, xmm0, qword ptr [rdi + 8*r11] LONG $0x117ba1c4; WORD $0xda04 // vmovsd 
qword ptr [rdx + 8*r11], xmm0 WORD $0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB66_9 LBB66_10: LONG $0x03f98349 // cmp r9, 3 JB LBB66_12 LBB66_11: LONG $0x107ba1c4; WORD $0xde04 // vmovsd xmm0, qword ptr [rsi + 8*r11] LONG $0x587ba1c4; WORD $0xdf04 // vaddsd xmm0, xmm0, qword ptr [rdi + 8*r11] LONG $0x117ba1c4; WORD $0xda04 // vmovsd qword ptr [rdx + 8*r11], xmm0 LONG $0x107ba1c4; WORD $0xde44; BYTE $0x08 // vmovsd xmm0, qword ptr [rsi + 8*r11 + 8] LONG $0x587ba1c4; WORD $0xdf44; BYTE $0x08 // vaddsd xmm0, xmm0, qword ptr [rdi + 8*r11 + 8] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x08 // vmovsd qword ptr [rdx + 8*r11 + 8], xmm0 LONG $0x107ba1c4; WORD $0xde44; BYTE $0x10 // vmovsd xmm0, qword ptr [rsi + 8*r11 + 16] LONG $0x587ba1c4; WORD $0xdf44; BYTE $0x10 // vaddsd xmm0, xmm0, qword ptr [rdi + 8*r11 + 16] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x10 // vmovsd qword ptr [rdx + 8*r11 + 16], xmm0 LONG $0x107ba1c4; WORD $0xde44; BYTE $0x18 // vmovsd xmm0, qword ptr [rsi + 8*r11 + 24] LONG $0x587ba1c4; WORD $0xdf44; BYTE $0x18 // vaddsd xmm0, xmm0, qword ptr [rdi + 8*r11 + 24] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x18 // vmovsd qword ptr [rdx + 8*r11 + 24], xmm0 LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB66_11 LBB66_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_float64_sub(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB67_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB67_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB67_8 LBB67_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 
128 JB LBB67_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB67_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB67_6: LONG $0x107da1c4; WORD $0xd704 // vmovupd ymm0, ymmword ptr [rdi + 8*r10] LONG $0x107da1c4; WORD $0xd74c; BYTE $0x20 // vmovupd ymm1, ymmword ptr [rdi + 8*r10 + 32] LONG $0x107da1c4; WORD $0xd754; BYTE $0x40 // vmovupd ymm2, ymmword ptr [rdi + 8*r10 + 64] LONG $0x107da1c4; WORD $0xd75c; BYTE $0x60 // vmovupd ymm3, ymmword ptr [rdi + 8*r10 + 96] LONG $0x5c7da1c4; WORD $0xd604 // vsubpd ymm0, ymm0, ymmword ptr [rsi + 8*r10] LONG $0x5c75a1c4; WORD $0xd64c; BYTE $0x20 // vsubpd ymm1, ymm1, ymmword ptr [rsi + 8*r10 + 32] LONG $0x5c6da1c4; WORD $0xd654; BYTE $0x40 // vsubpd ymm2, ymm2, ymmword ptr [rsi + 8*r10 + 64] LONG $0x5c65a1c4; WORD $0xd65c; BYTE $0x60 // vsubpd ymm3, ymm3, ymmword ptr [rsi + 8*r10 + 96] LONG $0x117da1c4; WORD $0xd204 // vmovupd ymmword ptr [rdx + 8*r10], ymm0 LONG $0x117da1c4; WORD $0xd24c; BYTE $0x20 // vmovupd ymmword ptr [rdx + 8*r10 + 32], ymm1 LONG $0x117da1c4; WORD $0xd254; BYTE $0x40 // vmovupd ymmword ptr [rdx + 8*r10 + 64], ymm2 LONG $0x117da1c4; WORD $0xd25c; BYTE $0x60 // vmovupd ymmword ptr [rdx + 8*r10 + 96], ymm3 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB67_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB67_12 LBB67_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB67_10 LBB67_9: LONG $0x107ba1c4; WORD $0xdf04 // vmovsd xmm0, qword ptr [rdi + 8*r11] LONG $0x5c7ba1c4; WORD $0xde04 // vsubsd xmm0, xmm0, qword ptr [rsi + 8*r11] LONG $0x117ba1c4; WORD $0xda04 // vmovsd qword ptr [rdx + 8*r11], xmm0 WORD 
$0xff49; BYTE $0xc3 // inc r11 WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB67_9 LBB67_10: LONG $0x03f98349 // cmp r9, 3 JB LBB67_12 LBB67_11: LONG $0x107ba1c4; WORD $0xdf04 // vmovsd xmm0, qword ptr [rdi + 8*r11] LONG $0x5c7ba1c4; WORD $0xde04 // vsubsd xmm0, xmm0, qword ptr [rsi + 8*r11] LONG $0x117ba1c4; WORD $0xda04 // vmovsd qword ptr [rdx + 8*r11], xmm0 LONG $0x107ba1c4; WORD $0xdf44; BYTE $0x08 // vmovsd xmm0, qword ptr [rdi + 8*r11 + 8] LONG $0x5c7ba1c4; WORD $0xde44; BYTE $0x08 // vsubsd xmm0, xmm0, qword ptr [rsi + 8*r11 + 8] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x08 // vmovsd qword ptr [rdx + 8*r11 + 8], xmm0 LONG $0x107ba1c4; WORD $0xdf44; BYTE $0x10 // vmovsd xmm0, qword ptr [rdi + 8*r11 + 16] LONG $0x5c7ba1c4; WORD $0xde44; BYTE $0x10 // vsubsd xmm0, xmm0, qword ptr [rsi + 8*r11 + 16] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x10 // vmovsd qword ptr [rdx + 8*r11 + 16], xmm0 LONG $0x107ba1c4; WORD $0xdf44; BYTE $0x18 // vmovsd xmm0, qword ptr [rdi + 8*r11 + 24] LONG $0x5c7ba1c4; WORD $0xde44; BYTE $0x18 // vsubsd xmm0, xmm0, qword ptr [rsi + 8*r11 + 24] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x18 // vmovsd qword ptr [rdx + 8*r11 + 24], xmm0 LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB67_11 LBB67_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_float64_mul(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB68_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x10f88349 // cmp r8, 16 JAE LBB68_3 WORD $0x3145; BYTE $0xdb // xor r11d, r11d JMP LBB68_8 LBB68_3: WORD $0x8949; BYTE $0xd1 // mov r9, rdx WORD $0x2949; BYTE $0xf9 // sub r9, rdi WORD $0x3145; BYTE $0xdb // xor r11d, r11d LONG $0x80f98149; WORD $0x0000; BYTE $0x00 // cmp r9, 128 JB LBB68_8 WORD $0x8948; BYTE 
$0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, rsi LONG $0x00803d48; WORD $0x0000 // cmp rax, 128 JB LBB68_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x0fe18341 // and r9d, 15 WORD $0x894d; BYTE $0xc3 // mov r11, r8 WORD $0x294d; BYTE $0xcb // sub r11, r9 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LBB68_6: LONG $0x107da1c4; WORD $0xd604 // vmovupd ymm0, ymmword ptr [rsi + 8*r10] LONG $0x107da1c4; WORD $0xd64c; BYTE $0x20 // vmovupd ymm1, ymmword ptr [rsi + 8*r10 + 32] LONG $0x107da1c4; WORD $0xd654; BYTE $0x40 // vmovupd ymm2, ymmword ptr [rsi + 8*r10 + 64] LONG $0x107da1c4; WORD $0xd65c; BYTE $0x60 // vmovupd ymm3, ymmword ptr [rsi + 8*r10 + 96] LONG $0x597da1c4; WORD $0xd704 // vmulpd ymm0, ymm0, ymmword ptr [rdi + 8*r10] LONG $0x5975a1c4; WORD $0xd74c; BYTE $0x20 // vmulpd ymm1, ymm1, ymmword ptr [rdi + 8*r10 + 32] LONG $0x596da1c4; WORD $0xd754; BYTE $0x40 // vmulpd ymm2, ymm2, ymmword ptr [rdi + 8*r10 + 64] LONG $0x5965a1c4; WORD $0xd75c; BYTE $0x60 // vmulpd ymm3, ymm3, ymmword ptr [rdi + 8*r10 + 96] LONG $0x117da1c4; WORD $0xd204 // vmovupd ymmword ptr [rdx + 8*r10], ymm0 LONG $0x117da1c4; WORD $0xd24c; BYTE $0x20 // vmovupd ymmword ptr [rdx + 8*r10 + 32], ymm1 LONG $0x117da1c4; WORD $0xd254; BYTE $0x40 // vmovupd ymmword ptr [rdx + 8*r10 + 64], ymm2 LONG $0x117da1c4; WORD $0xd25c; BYTE $0x60 // vmovupd ymmword ptr [rdx + 8*r10 + 96], ymm3 LONG $0x10c28349 // add r10, 16 WORD $0x394d; BYTE $0xd3 // cmp r11, r10 JNE LBB68_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB68_12 LBB68_8: WORD $0x2944; BYTE $0xd9 // sub ecx, r11d WORD $0x894d; BYTE $0xd9 // mov r9, r11 WORD $0xf749; BYTE $0xd1 // not r9 WORD $0x014d; BYTE $0xc1 // add r9, r8 LONG $0x03e18348 // and rcx, 3 JE LBB68_10 LBB68_9: LONG $0x107ba1c4; WORD $0xde04 // vmovsd xmm0, qword ptr [rsi + 8*r11] LONG $0x597ba1c4; WORD $0xdf04 // vmulsd xmm0, xmm0, qword ptr [rdi + 8*r11] LONG $0x117ba1c4; WORD $0xda04 // vmovsd qword ptr [rdx + 8*r11], xmm0 WORD $0xff49; BYTE $0xc3 // inc r11 
WORD $0xff48; BYTE $0xc9 // dec rcx JNE LBB68_9 LBB68_10: LONG $0x03f98349 // cmp r9, 3 JB LBB68_12 LBB68_11: LONG $0x107ba1c4; WORD $0xde04 // vmovsd xmm0, qword ptr [rsi + 8*r11] LONG $0x597ba1c4; WORD $0xdf04 // vmulsd xmm0, xmm0, qword ptr [rdi + 8*r11] LONG $0x117ba1c4; WORD $0xda04 // vmovsd qword ptr [rdx + 8*r11], xmm0 LONG $0x107ba1c4; WORD $0xde44; BYTE $0x08 // vmovsd xmm0, qword ptr [rsi + 8*r11 + 8] LONG $0x597ba1c4; WORD $0xdf44; BYTE $0x08 // vmulsd xmm0, xmm0, qword ptr [rdi + 8*r11 + 8] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x08 // vmovsd qword ptr [rdx + 8*r11 + 8], xmm0 LONG $0x107ba1c4; WORD $0xde44; BYTE $0x10 // vmovsd xmm0, qword ptr [rsi + 8*r11 + 16] LONG $0x597ba1c4; WORD $0xdf44; BYTE $0x10 // vmulsd xmm0, xmm0, qword ptr [rdi + 8*r11 + 16] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x10 // vmovsd qword ptr [rdx + 8*r11 + 16], xmm0 LONG $0x107ba1c4; WORD $0xde44; BYTE $0x18 // vmovsd xmm0, qword ptr [rsi + 8*r11 + 24] LONG $0x597ba1c4; WORD $0xdf44; BYTE $0x18 // vmulsd xmm0, xmm0, qword ptr [rdi + 8*r11 + 24] LONG $0x117ba1c4; WORD $0xda44; BYTE $0x18 // vmovsd qword ptr [rdx + 8*r11 + 24], xmm0 LONG $0x04c38349 // add r11, 4 WORD $0x394d; BYTE $0xd8 // cmp r8, r11 JNE LBB68_11 LBB68_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret TEXT ·_float64_div(SB), $0-32 MOVQ input1+0(FP), DI MOVQ input2+8(FP), SI MOVQ output+16(FP), DX MOVQ size+24(FP), CX BYTE $0x55 // push rbp WORD $0x8948; BYTE $0xe5 // mov rbp, rsp LONG $0xf8e48348 // and rsp, -8 WORD $0xc985 // test ecx, ecx JLE LBB69_12 WORD $0x8941; BYTE $0xc8 // mov r8d, ecx LONG $0x04f88349 // cmp r8, 4 JAE LBB69_3 WORD $0x3145; BYTE $0xd2 // xor r10d, r10d JMP LBB69_8 LBB69_3: WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf8 // sub rax, rdi WORD $0x3145; BYTE $0xd2 // xor r10d, r10d LONG $0x20f88348 // cmp rax, 32 JB LBB69_8 WORD $0x8948; BYTE $0xd0 // mov rax, rdx WORD $0x2948; BYTE $0xf0 // sub rax, 
rsi LONG $0x20f88348 // cmp rax, 32 JB LBB69_8 WORD $0x8941; BYTE $0xc9 // mov r9d, ecx LONG $0x03e18341 // and r9d, 3 WORD $0x894d; BYTE $0xc2 // mov r10, r8 WORD $0x294d; BYTE $0xca // sub r10, r9 WORD $0xc031 // xor eax, eax LBB69_6: LONG $0x0410fdc5; BYTE $0xc7 // vmovupd ymm0, ymmword ptr [rdi + 8*rax] LONG $0x045efdc5; BYTE $0xc6 // vdivpd ymm0, ymm0, ymmword ptr [rsi + 8*rax] LONG $0x0411fdc5; BYTE $0xc2 // vmovupd ymmword ptr [rdx + 8*rax], ymm0 LONG $0x04c08348 // add rax, 4 WORD $0x3949; BYTE $0xc2 // cmp r10, rax JNE LBB69_6 WORD $0x854d; BYTE $0xc9 // test r9, r9 JE LBB69_12 LBB69_8: WORD $0x2944; BYTE $0xd1 // sub ecx, r10d LONG $0x014a8d4d // lea r9, [r10 + 1] WORD $0xc1f6; BYTE $0x01 // test cl, 1 JE LBB69_10 LONG $0x107ba1c4; WORD $0xd704 // vmovsd xmm0, qword ptr [rdi + 8*r10] LONG $0x5e7ba1c4; WORD $0xd604 // vdivsd xmm0, xmm0, qword ptr [rsi + 8*r10] LONG $0x117ba1c4; WORD $0xd204 // vmovsd qword ptr [rdx + 8*r10], xmm0 WORD $0x894d; BYTE $0xca // mov r10, r9 LBB69_10: WORD $0x394d; BYTE $0xc8 // cmp r8, r9 JE LBB69_12 LBB69_11: LONG $0x107ba1c4; WORD $0xd704 // vmovsd xmm0, qword ptr [rdi + 8*r10] LONG $0x5e7ba1c4; WORD $0xd604 // vdivsd xmm0, xmm0, qword ptr [rsi + 8*r10] LONG $0x117ba1c4; WORD $0xd204 // vmovsd qword ptr [rdx + 8*r10], xmm0 LONG $0x107ba1c4; WORD $0xd744; BYTE $0x08 // vmovsd xmm0, qword ptr [rdi + 8*r10 + 8] LONG $0x5e7ba1c4; WORD $0xd644; BYTE $0x08 // vdivsd xmm0, xmm0, qword ptr [rsi + 8*r10 + 8] LONG $0x117ba1c4; WORD $0xd244; BYTE $0x08 // vmovsd qword ptr [rdx + 8*r10 + 8], xmm0 LONG $0x02c28349 // add r10, 2 WORD $0x394d; BYTE $0xd0 // cmp r8, r10 JNE LBB69_11 LBB69_12: WORD $0x8948; BYTE $0xec // mov rsp, rbp BYTE $0x5d // pop rbp WORD $0xf8c5; BYTE $0x77 // vzeroupper BYTE $0xc3 // ret golang-github-kelindar-simd-1.2.0/simd_funcs.go000066400000000000000000000546131517522302000214510ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved. // Licensed under the MIT license. 
See LICENSE file in the project root for details. package simd import "unsafe" // ---------------------------------- Uint8 ---------------------------------- // SumUint8s sums up all of the elements of the slice and returns the value func SumUint8s(input []uint8) (out uint8) { switch { case hardware: _uint8_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinUint8s returns the smallest element value in the slice func MinUint8s(input []uint8) (out uint8) { switch { case hardware: _uint8_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxUint8s returns the largest element value in the slice func MaxUint8s(input []uint8) (out uint8) { switch { case hardware: _uint8_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddUint8s adds input1 to input2 and writes back the result into dst slice func AddUint8s(dst, input1, input2 []uint8) []uint8 { switch { case hardware: _uint8_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubUint8s subtracts input2 from input1 and writes back the result into dst slice func SubUint8s(dst, input1, input2 []uint8) []uint8 { switch { case hardware: _uint8_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulUint8s multiplies input1 by input2 and writes back the result into dst slice func MulUint8s(dst, input1, input2 []uint8) []uint8 { switch { case hardware: _uint8_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivUint8s divides input1 by input2 and writes back the result into dst slice func DivUint8s(dst, input1, input2 []uint8) []uint8 
{ switch { case hardware: _uint8_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Uint16 ---------------------------------- // SumUint16s sums up all of the elements of the slice and returns the value func SumUint16s(input []uint16) (out uint16) { switch { case hardware: _uint16_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinUint16s returns the smallest element value in the slice func MinUint16s(input []uint16) (out uint16) { switch { case hardware: _uint16_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxUint16s returns the largest element value in the slice func MaxUint16s(input []uint16) (out uint16) { switch { case hardware: _uint16_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddUint16s adds input1 to input2 and writes back the result into dst slice func AddUint16s(dst, input1, input2 []uint16) []uint16 { switch { case hardware: _uint16_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubUint16s subtracts input2 from input1 and writes back the result into dst slice func SubUint16s(dst, input1, input2 []uint16) []uint16 { switch { case hardware: _uint16_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulUint16s multiplies input1 by input2 and writes back the result into dst slice func MulUint16s(dst, input1, input2 []uint16) []uint16 { switch { case hardware: _uint16_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, 
input2) } // DivUint16s divides input1 by input2 and writes back the result into dst slice func DivUint16s(dst, input1, input2 []uint16) []uint16 { switch { case hardware: _uint16_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Uint32 ---------------------------------- // SumUint32s sums up all of the elements of the slice and returns the value func SumUint32s(input []uint32) (out uint32) { switch { case hardware: _uint32_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinUint32s returns the smallest element value in the slice func MinUint32s(input []uint32) (out uint32) { switch { case hardware: _uint32_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxUint32s returns the largest element value in the slice func MaxUint32s(input []uint32) (out uint32) { switch { case hardware: _uint32_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddUint32s adds input1 to input2 and writes back the result into dst slice func AddUint32s(dst, input1, input2 []uint32) []uint32 { switch { case hardware: _uint32_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubUint32s subtracts input2 from input1 and writes back the result into dst slice func SubUint32s(dst, input1, input2 []uint32) []uint32 { switch { case hardware: _uint32_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulUint32s multiplies input1 by input2 and writes back the result into dst slice func MulUint32s(dst, input1, input2 []uint32) []uint32 { switch { case hardware: 
_uint32_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivUint32s divides input1 by input2 and writes back the result into dst slice func DivUint32s(dst, input1, input2 []uint32) []uint32 { switch { case hardware: _uint32_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Uint64 ---------------------------------- // SumUint64s sums up all of the elements of the slice and returns the value func SumUint64s(input []uint64) (out uint64) { switch { case hardware: _uint64_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinUint64s returns the smallest element value in the slice func MinUint64s(input []uint64) (out uint64) { switch { case hardware: _uint64_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxUint64s returns the largest element value in the slice func MaxUint64s(input []uint64) (out uint64) { switch { case hardware: _uint64_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddUint64s adds input1 to input2 and writes back the result into dst slice func AddUint64s(dst, input1, input2 []uint64) []uint64 { switch { case hardware: _uint64_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubUint64s subtracts input2 from input1 and writes back the result into dst slice func SubUint64s(dst, input1, input2 []uint64) []uint64 { switch { case hardware: _uint64_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulUint64s 
multiplies input1 by input2 and writes back the result into dst slice func MulUint64s(dst, input1, input2 []uint64) []uint64 { switch { case hardware: _uint64_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivUint64s divides input1 by input2 and writes back the result into dst slice func DivUint64s(dst, input1, input2 []uint64) []uint64 { switch { case hardware: _uint64_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Int8 ---------------------------------- // SumInt8s sums up all of the elements of the slice and returns the value func SumInt8s(input []int8) (out int8) { switch { case hardware: _int8_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinInt8s returns the smallest element value in the slice func MinInt8s(input []int8) (out int8) { switch { case hardware: _int8_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxInt8s returns the largest element value in the slice func MaxInt8s(input []int8) (out int8) { switch { case hardware: _int8_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddInt8s adds input1 to input2 and writes back the result into dst slice func AddInt8s(dst, input1, input2 []int8) []int8 { switch { case hardware: _int8_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubInt8s subtracts input2 from input1 and writes back the result into dst slice func SubInt8s(dst, input1, input2 []int8) []int8 { switch { case hardware: _int8_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), 
unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulInt8s multiplies input1 by input2 and writes back the result into dst slice func MulInt8s(dst, input1, input2 []int8) []int8 { switch { case hardware: _int8_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivInt8s divides input1 by input2 and writes back the result into dst slice func DivInt8s(dst, input1, input2 []int8) []int8 { switch { case hardware: _int8_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Int16 ---------------------------------- // SumInt16s sums up all of the elements of the slice and returns the value func SumInt16s(input []int16) (out int16) { switch { case hardware: _int16_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinInt16s returns the smallest element value in the slice func MinInt16s(input []int16) (out int16) { switch { case hardware: _int16_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxInt16s returns the largest element value in the slice func MaxInt16s(input []int16) (out int16) { switch { case hardware: _int16_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddInt16s adds input1 to input2 and writes back the result into dst slice func AddInt16s(dst, input1, input2 []int16) []int16 { switch { case hardware: _int16_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubInt16s subtracts input2 from input1 and writes back the result into dst slice func SubInt16s(dst, input1, input2 []int16) 
[]int16 { switch { case hardware: _int16_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulInt16s multiplies input1 by input2 and writes back the result into dst slice func MulInt16s(dst, input1, input2 []int16) []int16 { switch { case hardware: _int16_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivInt16s divides input1 by input2 and writes back the result into dst slice func DivInt16s(dst, input1, input2 []int16) []int16 { switch { case hardware: _int16_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Int32 ---------------------------------- // SumInt32s sums up all of the elements of the slice and returns the value func SumInt32s(input []int32) (out int32) { switch { case hardware: _int32_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinInt32s returns the smallest element value in the slice func MinInt32s(input []int32) (out int32) { switch { case hardware: _int32_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxInt32s returns the largest element value in the slice func MaxInt32s(input []int32) (out int32) { switch { case hardware: _int32_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddInt32s adds input1 to input2 and writes back the result into dst slice func AddInt32s(dst, input1, input2 []int32) []int32 { switch { case hardware: _int32_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubInt32s 
subtracts input2 from input1 and writes back the result into dst slice func SubInt32s(dst, input1, input2 []int32) []int32 { switch { case hardware: _int32_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulInt32s multiplies input1 by input2 and writes back the result into dst slice func MulInt32s(dst, input1, input2 []int32) []int32 { switch { case hardware: _int32_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivInt32s divides input1 by input2 and writes back the result into dst slice func DivInt32s(dst, input1, input2 []int32) []int32 { switch { case hardware: _int32_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Int64 ---------------------------------- // SumInt64s sums up all of the elements of the slice and returns the value func SumInt64s(input []int64) (out int64) { switch { case hardware: _int64_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinInt64s returns the smallest element value in the slice func MinInt64s(input []int64) (out int64) { switch { case hardware: _int64_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxInt64s returns the largest element value in the slice func MaxInt64s(input []int64) (out int64) { switch { case hardware: _int64_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddInt64s adds input1 to input2 and writes back the result into dst slice func AddInt64s(dst, input1, input2 []int64) []int64 { switch { case hardware: _int64_add(unsafe.Pointer(&input1[0]), 
unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubInt64s subtracts input2 from input1 and writes back the result into dst slice func SubInt64s(dst, input1, input2 []int64) []int64 { switch { case hardware: _int64_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulInt64s multiplies input1 by input2 and writes back the result into dst slice func MulInt64s(dst, input1, input2 []int64) []int64 { switch { case hardware: _int64_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivInt64s divides input1 by input2 and writes back the result into dst slice func DivInt64s(dst, input1, input2 []int64) []int64 { switch { case hardware: _int64_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Float32 ---------------------------------- // SumFloat32s sums up all of the elements of the slice and returns the value func SumFloat32s(input []float32) (out float32) { switch { case hardware: _float32_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinFloat32s returns the smallest element value in the slice func MinFloat32s(input []float32) (out float32) { switch { case hardware: _float32_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxFloat32s returns the largest element value in the slice func MaxFloat32s(input []float32) (out float32) { switch { case hardware: _float32_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddFloat32s adds input1 to input2 and writes back 
the result into dst slice func AddFloat32s(dst, input1, input2 []float32) []float32 { switch { case hardware: _float32_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubFloat32s subtracts input2 from input1 and writes back the result into dst slice func SubFloat32s(dst, input1, input2 []float32) []float32 { switch { case hardware: _float32_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulFloat32s multiplies input1 by input2 and writes back the result into dst slice func MulFloat32s(dst, input1, input2 []float32) []float32 { switch { case hardware: _float32_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivFloat32s divides input1 by input2 and writes back the result into dst slice func DivFloat32s(dst, input1, input2 []float32) []float32 { switch { case hardware: _float32_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } // ---------------------------------- Float64 ---------------------------------- // SumFloat64s sums up all of the elements of the slice and returns the value func SumFloat64s(input []float64) (out float64) { switch { case hardware: _float64_sum(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return sum(input) } } // MinFloat64s returns the smallest element value in the slice func MinFloat64s(input []float64) (out float64) { switch { case hardware: _float64_min(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return min(input) } } // MaxFloat64s returns the largest element value in the slice func MaxFloat64s(input []float64) (out float64) { switch { case 
hardware: _float64_max(unsafe.Pointer(&input[0]), unsafe.Pointer(&out), uint64(len(input))) return default: return max(input) } } // AddFloat64s adds input1 to input2 and writes back the result into dst slice func AddFloat64s(dst, input1, input2 []float64) []float64 { switch { case hardware: _float64_add(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return add(dst, input1, input2) } // SubFloat64s subtracts input2 from input1 and writes back the result into dst slice func SubFloat64s(dst, input1, input2 []float64) []float64 { switch { case hardware: _float64_sub(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return sub(dst, input1, input2) } // MulFloat64s multiplies input1 by input2 and writes back the result into dst slice func MulFloat64s(dst, input1, input2 []float64) []float64 { switch { case hardware: _float64_mul(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return mul(dst, input1, input2) } // DivFloat64s divides input1 by input2 and writes back the result into dst slice func DivFloat64s(dst, input1, input2 []float64) []float64 { switch { case hardware: _float64_div(unsafe.Pointer(&input1[0]), unsafe.Pointer(&input2[0]), unsafe.Pointer(&(dst)[0]), uint64(len(dst))) return dst } return div(dst, input1, input2) } golang-github-kelindar-simd-1.2.0/simd_neon_arm64.go000066400000000000000000000200741517522302000222750ustar00rootroot00000000000000//go:build !noasm && !darwin && arm64 // AUTO-GENERATED BY GOCC -- DO NOT EDIT package simd import "unsafe" //go:nosplit //go:noescape func _uint8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint8_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit 
//go:noescape
// _uint8_add has no Go body, so its implementation lives outside this file.
// AddUint8s passes it &input1[0], &input2[0], &dst[0] and len(dst) as size.
func _uint8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint8_sub has no Go body. SubUint8s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint8_mul has no Go body. MulUint8s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint8_div has no Go body. DivUint8s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint16_sum has no Go body. SumUint16s passes it &input[0], the address of
// its uint16 result and len(input) as size.
//
//go:nosplit
//go:noescape
func _uint16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

// _uint16_min has no Go body. MinUint16s passes it &input[0], the address of
// its uint16 result and len(input) as size.
//
//go:nosplit
//go:noescape
func _uint16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

// _uint16_max has no Go body. MaxUint16s passes it &input[0], the address of
// its uint16 result and len(input) as size.
//
//go:nosplit
//go:noescape
func _uint16_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

// _uint16_add has no Go body. AddUint16s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint16_sub has no Go body. SubUint16s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint16_mul has no Go body. MulUint16s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint16_div has no Go body. DivUint16s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint32_sum has no Go body. SumUint32s passes it &input[0], the address of
// its uint32 result and len(input) as size.
//
//go:nosplit
//go:noescape
func _uint32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64)

// _uint32_min has no Go body. MinUint32s passes it &input[0], the address of
// its uint32 result and len(input) as size.
//
//go:nosplit
//go:noescape
func _uint32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64)

// _uint32_max has no Go body. MaxUint32s passes it &input[0], the address of
// its uint32 result and len(input) as size.
//
//go:nosplit
//go:noescape
func _uint32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64)

// _uint32_add has no Go body. AddUint32s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint32_sub has no Go body. SubUint32s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

// _uint32_mul has no Go body. MulUint32s passes it &input1[0], &input2[0],
// &dst[0] and len(dst) as size.
//
//go:nosplit
//go:noescape
func _uint32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64)

//go:nosplit
//go:noescape func _uint32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _uint64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_max(input unsafe.Pointer, result 
unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _int64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, 
size uint64) //go:nosplit //go:noescape func _int64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) //go:nosplit //go:noescape func _float64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) golang-github-kelindar-simd-1.2.0/simd_neon_arm64.s000066400000000000000000004101771517522302000221410ustar00rootroot00000000000000//go:build !noasm && !darwin && arm64 // AUTO-GENERATED BY GOCC -- DO NOT EDIT TEXT 
·_uint8_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt .LBB0_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540000c2 // b.hs .LBB0_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x2a1f03ea // mov w10, wzr WORD $0x14000028 // b .LBB0_13 LBB0_3: WORD $0x3900003f // strb wzr, [x1] WORD $0xd65f03c0 // ret LBB0_4: WORD $0xf100811f // cmp x8, #32 WORD $0x54000082 // b.hs .LBB0_6 WORD $0x2a1f03ea // mov w10, wzr WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000013 // b .LBB0_10 LBB0_6: WORD $0x9240104b // and x11, x2, #0x1f WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB0_7: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x4e208440 // add v0.16b, v2.16b, v0.16b WORD $0x4e218461 // add v1.16b, v3.16b, v1.16b WORD $0x54ffff61 // b.ne .LBB0_7 WORD $0x4e208420 // add v0.16b, v1.16b, v0.16b WORD $0x4e31b800 // addv b0, v0.16b WORD $0x1e26000a // fmov w10, s0 WORD $0xb40002eb // cbz x11, .LBB0_15 WORD $0xf100217f // cmp x11, #8 WORD $0x540001e3 // b.lo .LBB0_13 LBB0_10: WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0x9240084b // and x11, x2, #0x7 WORD $0x8b0b012c // add x12, x9, x11 WORD $0xcb08018c // sub x12, x12, x8 WORD $0x4e011d40 // mov v0.b[0], w10 WORD $0x8b09000a // add x10, x0, x9 WORD $0xcb0b0109 // sub x9, x8, x11 LBB0_11: WORD $0xfc408541 // ldr d1, [x10], #8 WORD $0xb100218c // adds x12, x12, #8 WORD $0x0e208420 // add v0.8b, v1.8b, v0.8b WORD $0x54ffffa1 // b.ne .LBB0_11 WORD $0x0e31b800 // addv b0, v0.8b WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, .LBB0_15 LBB0_13: WORD $0x8b09000b // add x11, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB0_14: WORD $0x38401569 // ldrb w9, [x11], #1 WORD 
$0xf1000508 // subs x8, x8, #1 WORD $0x0b0a012a // add w10, w9, w10 WORD $0x54ffffa1 // b.ne .LBB0_14 LBB0_15: WORD $0x3900002a // strb w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint8_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x39400009 // ldrb w9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400062b // b.lt .LBB1_14 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB1_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000024 // b .LBB1_12 LBB1_3: WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB1_5 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000013 // b .LBB1_9 LBB1_5: WORD $0x9240104b // and x11, x2, #0x1f WORD $0x4e010d20 // dup v0.16b, w9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB1_6: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x6e206c40 // umin v0.16b, v2.16b, v0.16b WORD $0x6e216c61 // umin v1.16b, v3.16b, v1.16b WORD $0x54ffff61 // b.ne .LBB1_6 WORD $0x6e216c00 // umin v0.16b, v0.16b, v1.16b WORD $0x6e31a800 // uminv b0, v0.16b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400030b // cbz x11, .LBB1_14 WORD $0xf100217f // cmp x11, #8 WORD $0x540001c3 // b.lo .LBB1_12 LBB1_9: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x0e010d20 // dup v0.8b, w9 WORD $0x8b0b014c // add x12, x10, x11 WORD $0x8b0a0009 // add x9, x0, x10 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0xcb08018c // sub x12, x12, x8 LBB1_10: WORD $0xfc408521 // ldr d1, [x9], #8 WORD $0xb100218c // adds x12, x12, #8 WORD $0x2e206c20 // umin v0.8b, v1.8b, v0.8b WORD $0x54ffffa1 // b.ne .LBB1_10 WORD $0x2e31a800 // uminv b0, v0.8b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400012b // cbz x11, .LBB1_14 LBB1_12: WORD $0x8b0a000b // add x11, x0, x10 WORD $0xcb0a0108 // sub x8, x8, x10 LBB1_13: WORD $0x3840156a // ldrb w10, [x11], #1 
WORD $0x12001d29 // and w9, w9, #0xff WORD $0x6b09015f // cmp w10, w9 WORD $0x1a893149 // csel w9, w10, w9, lo WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne .LBB1_13 LBB1_14: WORD $0x39000029 // strb w9, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint8_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x39400009 // ldrb w9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400062b // b.lt .LBB2_14 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB2_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000024 // b .LBB2_12 LBB2_3: WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB2_5 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000013 // b .LBB2_9 LBB2_5: WORD $0x9240104b // and x11, x2, #0x1f WORD $0x4e010d20 // dup v0.16b, w9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB2_6: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x6e206440 // umax v0.16b, v2.16b, v0.16b WORD $0x6e216461 // umax v1.16b, v3.16b, v1.16b WORD $0x54ffff61 // b.ne .LBB2_6 WORD $0x6e216400 // umax v0.16b, v0.16b, v1.16b WORD $0x6e30a800 // umaxv b0, v0.16b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400030b // cbz x11, .LBB2_14 WORD $0xf100217f // cmp x11, #8 WORD $0x540001c3 // b.lo .LBB2_12 LBB2_9: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x0e010d20 // dup v0.8b, w9 WORD $0x8b0b014c // add x12, x10, x11 WORD $0x8b0a0009 // add x9, x0, x10 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0xcb08018c // sub x12, x12, x8 LBB2_10: WORD $0xfc408521 // ldr d1, [x9], #8 WORD $0xb100218c // adds x12, x12, #8 WORD $0x2e206420 // umax v0.8b, v1.8b, v0.8b WORD $0x54ffffa1 // b.ne .LBB2_10 WORD $0x2e30a800 // umaxv b0, v0.8b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400012b // cbz x11, .LBB2_14 LBB2_12: WORD $0x8b0a000b // add x11, x0, x10 WORD 
$0xcb0a0108 // sub x8, x8, x10 LBB2_13: WORD $0x3840156a // ldrb w10, [x11], #1 WORD $0x12001d29 // and w9, w9, #0xff WORD $0x6b09015f // cmp w10, w9 WORD $0x1a898149 // csel w9, w10, w9, hi WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne .LBB2_13 LBB2_14: WORD $0x39000029 // strb w9, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint8_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt .LBB3_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs .LBB3_6 WORD $0xaa1f03e9 // mov x9, xzr LBB3_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB3_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0x38001549 // strb w9, [x10], #1 WORD $0x54ffff61 // b.ne .LBB3_4 LBB3_5: WORD $0xd65f03c0 // ret LBB3_6: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo .LBB3_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo .LBB3_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB3_10 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000014 // b .LBB3_14 LBB3_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB3_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e208440 // add v0.16b, v2.16b, v0.16b WORD $0x4e218461 // add v1.16b, v3.16b, v1.16b WORD $0xad3f85a0 // stp q0, q1, 
[x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB3_11 WORD $0xb4fffc8a // cbz x10, .LBB3_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo .LBB3_3 LBB3_14: WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b0a012e // add x14, x9, x10 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xcb0801ce // sub x14, x14, x8 LBB3_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x0e208420 // add v0.8b, v1.8b, v0.8b WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0x54ffff61 // b.ne .LBB3_15 WORD $0xb5fff94a // cbnz x10, .LBB3_3 WORD $0x17ffffd3 // b .LBB3_5 TEXT ·_uint8_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt .LBB4_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs .LBB4_6 WORD $0xaa1f03e9 // mov x9, xzr LBB4_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB4_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0x38001549 // strb w9, [x10], #1 WORD $0x54ffff61 // b.ne .LBB4_4 LBB4_5: WORD $0xd65f03c0 // ret LBB4_6: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo .LBB4_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo .LBB4_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB4_10 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000014 // b .LBB4_14 LBB4_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD 
$0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB4_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6e228400 // sub v0.16b, v0.16b, v2.16b WORD $0x6e238421 // sub v1.16b, v1.16b, v3.16b WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB4_11 WORD $0xb4fffc8a // cbz x10, .LBB4_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo .LBB4_3 LBB4_14: WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b0a012e // add x14, x9, x10 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xcb0801ce // sub x14, x14, x8 LBB4_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x2e218400 // sub v0.8b, v0.8b, v1.8b WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0x54ffff61 // b.ne .LBB4_15 WORD $0xb5fff94a // cbnz x10, .LBB4_3 WORD $0x17ffffd3 // b .LBB4_5 TEXT ·_uint8_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt .LBB5_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs .LBB5_6 WORD $0xaa1f03e9 // mov x9, xzr LBB5_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB5_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0x38001549 // strb w9, [x10], #1 WORD $0x54ffff61 // b.ne .LBB5_4 LBB5_5: WORD $0xd65f03c0 // ret LBB5_6: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, 
x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo .LBB5_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo .LBB5_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB5_10 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000014 // b .LBB5_14 LBB5_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB5_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e209c40 // mul v0.16b, v2.16b, v0.16b WORD $0x4e219c61 // mul v1.16b, v3.16b, v1.16b WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB5_11 WORD $0xb4fffc8a // cbz x10, .LBB5_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo .LBB5_3 LBB5_14: WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b0a012e // add x14, x9, x10 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xcb0801ce // sub x14, x14, x8 LBB5_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x0e209c20 // mul v0.8b, v1.8b, v0.8b WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0x54ffff61 // b.ne .LBB5_15 WORD $0xb5fff94a // cbnz x10, .LBB5_3 WORD $0x17ffffd3 // b .LBB5_5 TEXT ·_uint8_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0xf81f0ff3 // str x19, [sp, #-16]! 
WORD $0x7100047f // cmp w3, #1 WORD $0x54000c6b // b.lt .LBB6_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB6_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000054 // b .LBB6_8 LBB6_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000a03 // b.lo .LBB6_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x540009a3 // b.lo .LBB6_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xaa0203eb // mov x11, x2 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0903ec // mov x12, x9 WORD $0xaa0003ee // mov x14, x0 LBB6_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0xf100418c // subs x12, x12, #16 WORD $0x0e033c10 // umov w16, v0.b[1] WORD $0x0e013c11 // umov w17, v0.b[0] WORD $0x0e033c2f // umov w15, v1.b[1] WORD $0x0e053c12 // umov w18, v0.b[2] WORD $0x0e073c03 // umov w3, v0.b[3] WORD $0x0e093c04 // umov w4, v0.b[4] WORD $0x0e0b3c05 // umov w5, v0.b[5] WORD $0x0e0d3c06 // umov w6, v0.b[6] WORD $0x0e0f3c07 // umov w7, v0.b[7] WORD $0x0e113c13 // umov w19, v0.b[8] WORD $0x1acf0a0f // udiv w15, w16, w15 WORD $0x0e013c30 // umov w16, v1.b[0] WORD $0x1ad00a30 // udiv w16, w17, w16 WORD $0x0e053c31 // umov w17, v1.b[2] WORD $0x1ad10a51 // udiv w17, w18, w17 WORD $0x0e073c32 // umov w18, v1.b[3] WORD $0x1e270202 // fmov s2, w16 WORD $0x0e133c30 // umov w16, v1.b[9] WORD $0x4e031de2 // mov v2.b[1], w15 WORD $0x1ad20872 // udiv w18, w3, w18 WORD $0x0e093c23 // umov w3, v1.b[4] WORD $0x4e051e22 // mov v2.b[2], w17 WORD $0x0e153c11 // umov w17, v0.b[10] WORD $0x1ac30883 // udiv w3, w4, w3 WORD $0x0e0b3c24 // umov w4, v1.b[5] WORD $0x4e071e42 // mov v2.b[3], w18 WORD $0x0e173c12 // umov w18, v0.b[11] WORD $0x1ac408a4 // udiv w4, w5, w4 WORD $0x0e0d3c25 // umov w5, v1.b[6] WORD $0x4e091c62 // mov v2.b[4], w3 WORD $0x0e193c03 // umov w3, v0.b[12] WORD $0x1ac508c5 // udiv w5, 
w6, w5 WORD $0x0e0f3c26 // umov w6, v1.b[7] WORD $0x4e0b1c82 // mov v2.b[5], w4 WORD $0x0e1b3c04 // umov w4, v0.b[13] WORD $0x1ac608e6 // udiv w6, w7, w6 WORD $0x0e113c27 // umov w7, v1.b[8] WORD $0x4e0d1ca2 // mov v2.b[6], w5 WORD $0x1ac70a67 // udiv w7, w19, w7 WORD $0x0e133c13 // umov w19, v0.b[9] WORD $0x4e0f1cc2 // mov v2.b[7], w6 WORD $0x1ad00a6f // udiv w15, w19, w16 WORD $0x0e153c30 // umov w16, v1.b[10] WORD $0x4e111ce2 // mov v2.b[8], w7 WORD $0x1ad00a30 // udiv w16, w17, w16 WORD $0x0e173c31 // umov w17, v1.b[11] WORD $0x4e131de2 // mov v2.b[9], w15 WORD $0x1ad10a51 // udiv w17, w18, w17 WORD $0x0e193c32 // umov w18, v1.b[12] WORD $0x4e151e02 // mov v2.b[10], w16 WORD $0x0e1d3c30 // umov w16, v1.b[14] WORD $0x1ad20872 // udiv w18, w3, w18 WORD $0x0e1b3c23 // umov w3, v1.b[13] WORD $0x4e171e22 // mov v2.b[11], w17 WORD $0x0e1f3c31 // umov w17, v1.b[15] WORD $0x1ac3088f // udiv w15, w4, w3 WORD $0x0e1d3c03 // umov w3, v0.b[14] WORD $0x4e191e42 // mov v2.b[12], w18 WORD $0x0e1f3c12 // umov w18, v0.b[15] WORD $0x1ad00870 // udiv w16, w3, w16 WORD $0x4e1b1de2 // mov v2.b[13], w15 WORD $0x1ad10a4f // udiv w15, w18, w17 WORD $0x4e1d1e02 // mov v2.b[14], w16 WORD $0x4e1f1de2 // mov v2.b[15], w15 WORD $0x3c810562 // str q2, [x11], #16 WORD $0x54fff781 // b.ne .LBB6_6 WORD $0xb400016a // cbz x10, .LBB6_10 LBB6_8: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB6_9: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1acd0929 // udiv w9, w9, w13 WORD $0x38001549 // strb w9, [x10], #1 WORD $0x54ffff61 // b.ne .LBB6_9 LBB6_10: WORD $0xf84107f3 // ldr x19, [sp], #16 WORD $0xd65f03c0 // ret TEXT ·_uint16_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt .LBB7_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD 
$0xf100411f // cmp x8, #16 WORD $0x540000c2 // b.hs .LBB7_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x2a1f03ea // mov w10, wzr WORD $0x14000013 // b .LBB7_7 LBB7_3: WORD $0x7900003f // strh wzr, [x1] WORD $0xd65f03c0 // ret LBB7_4: WORD $0x92400c4b // and x11, x2, #0xf WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB7_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x4e608440 // add v0.8h, v2.8h, v0.8h WORD $0x4e618461 // add v1.8h, v3.8h, v1.8h WORD $0x54ffff61 // b.ne .LBB7_5 WORD $0x4e608420 // add v0.8h, v1.8h, v0.8h WORD $0x4e71b800 // addv h0, v0.8h WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, .LBB7_9 LBB7_7: WORD $0x8b09040b // add x11, x0, x9, lsl #1 WORD $0xcb090108 // sub x8, x8, x9 LBB7_8: WORD $0x78402569 // ldrh w9, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0a012a // add w10, w9, w10 WORD $0x54ffffa1 // b.ne .LBB7_8 LBB7_9: WORD $0x7900002a // strh w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint16_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt .LBB8_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs .LBB8_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB8_6 LBB8_3: WORD $0x92400c4b // and x11, x2, #0xf WORD $0x4e020d00 // dup v0.8h, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB8_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x6e606c40 // umin v0.8h, v2.8h, v0.8h WORD $0x6e616c61 // umin v1.8h, v3.8h, v1.8h WORD $0x54ffff61 
// b.ne .LBB8_4 WORD $0x6e616c00 // umin v0.8h, v0.8h, v1.8h WORD $0x6e71a800 // uminv h0, v0.8h WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, .LBB8_8 LBB8_6: WORD $0x8b0a040b // add x11, x0, x10, lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 LBB8_7: WORD $0x7840256a // ldrh w10, [x11], #2 WORD $0x12003d08 // and w8, w8, #0xffff WORD $0x6b08015f // cmp w10, w8 WORD $0x1a883148 // csel w8, w10, w8, lo WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne .LBB8_7 LBB8_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint16_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt .LBB9_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs .LBB9_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB9_6 LBB9_3: WORD $0x92400c4b // and x11, x2, #0xf WORD $0x4e020d00 // dup v0.8h, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB9_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x6e606440 // umax v0.8h, v2.8h, v0.8h WORD $0x6e616461 // umax v1.8h, v3.8h, v1.8h WORD $0x54ffff61 // b.ne .LBB9_4 WORD $0x6e616400 // umax v0.8h, v0.8h, v1.8h WORD $0x6e70a800 // umaxv h0, v0.8h WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, .LBB9_8 LBB9_6: WORD $0x8b0a040b // add x11, x0, x10, lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 LBB9_7: WORD $0x7840256a // ldrh w10, [x11], #2 WORD $0x12003d08 // and w8, w8, #0xffff WORD $0x6b08015f // cmp w10, w8 WORD $0x1a888148 // csel w8, w10, w8, hi WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne .LBB9_7 LBB9_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint16_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 
MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB10_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB10_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB10_8 LBB10_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB10_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB10_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB10_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e608440 // add v0.8h, v2.8h, v0.8h WORD $0x4e618461 // add v1.8h, v3.8h, v1.8h WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB10_6 WORD $0xb400018a // cbz x10, .LBB10_10 LBB10_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB10_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0x78002549 // strh w9, [x10], #2 WORD $0x54ffff61 // b.ne .LBB10_9 LBB10_10: WORD $0xd65f03c0 // ret TEXT ·_uint16_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB11_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB11_3 WORD $0xaa1f03e9 // mov x9, xzr WORD 
$0x14000019 // b .LBB11_8 LBB11_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB11_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB11_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB11_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6e628400 // sub v0.8h, v0.8h, v2.8h WORD $0x6e638421 // sub v1.8h, v1.8h, v3.8h WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB11_6 WORD $0xb400018a // cbz x10, .LBB11_10 LBB11_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB11_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0x78002549 // strh w9, [x10], #2 WORD $0x54ffff61 // b.ne .LBB11_9 LBB11_10: WORD $0xd65f03c0 // ret TEXT ·_uint16_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB12_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB12_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB12_8 LBB12_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB12_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo 
.LBB12_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB12_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e609c40 // mul v0.8h, v2.8h, v0.8h WORD $0x4e619c61 // mul v1.8h, v3.8h, v1.8h WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB12_6 WORD $0xb400018a // cbz x10, .LBB12_10 LBB12_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB12_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0x78002549 // strh w9, [x10], #2 WORD $0x54ffff61 // b.ne .LBB12_9 LBB12_10: WORD $0xd65f03c0 // ret TEXT ·_uint16_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400088b // b.lt .LBB13_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB13_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000034 // b .LBB13_8 LBB13_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000603 // b.lo .LBB13_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x540005a3 // b.lo .LBB13_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xaa0203eb // mov x11, x2 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0903ec // mov x12, x9 WORD $0xaa0003ee // mov x14, x0 LBB13_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD 
$0x3cc105a1 // ldr q1, [x13], #16 WORD $0xf100218c // subs x12, x12, #8 WORD $0x0e063c10 // umov w16, v0.h[1] WORD $0x0e023c11 // umov w17, v0.h[0] WORD $0x0e063c2f // umov w15, v1.h[1] WORD $0x0e0a3c12 // umov w18, v0.h[2] WORD $0x0e0e3c03 // umov w3, v0.h[3] WORD $0x0e123c04 // umov w4, v0.h[4] WORD $0x1acf0a0f // udiv w15, w16, w15 WORD $0x0e023c30 // umov w16, v1.h[0] WORD $0x1ad00a30 // udiv w16, w17, w16 WORD $0x0e0a3c31 // umov w17, v1.h[2] WORD $0x1ad10a51 // udiv w17, w18, w17 WORD $0x0e0e3c32 // umov w18, v1.h[3] WORD $0x1e270202 // fmov s2, w16 WORD $0x0e163c30 // umov w16, v1.h[5] WORD $0x4e061de2 // mov v2.h[1], w15 WORD $0x1ad20872 // udiv w18, w3, w18 WORD $0x0e123c23 // umov w3, v1.h[4] WORD $0x4e0a1e22 // mov v2.h[2], w17 WORD $0x0e1a3c11 // umov w17, v0.h[6] WORD $0x1ac30883 // udiv w3, w4, w3 WORD $0x0e163c04 // umov w4, v0.h[5] WORD $0x4e0e1e42 // mov v2.h[3], w18 WORD $0x0e1e3c12 // umov w18, v0.h[7] WORD $0x1ad0088f // udiv w15, w4, w16 WORD $0x0e1a3c30 // umov w16, v1.h[6] WORD $0x4e121c62 // mov v2.h[4], w3 WORD $0x1ad00a30 // udiv w16, w17, w16 WORD $0x0e1e3c31 // umov w17, v1.h[7] WORD $0x4e161de2 // mov v2.h[5], w15 WORD $0x1ad10a4f // udiv w15, w18, w17 WORD $0x4e1a1e02 // mov v2.h[6], w16 WORD $0x4e1e1de2 // mov v2.h[7], w15 WORD $0x3c810562 // str q2, [x11], #16 WORD $0x54fffb81 // b.ne .LBB13_6 WORD $0xb400018a // cbz x10, .LBB13_10 LBB13_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB13_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1acd0929 // udiv w9, w9, w13 WORD $0x78002549 // strh w9, [x10], #2 WORD $0x54ffff61 // b.ne .LBB13_9 LBB13_10: WORD $0xd65f03c0 // ret TEXT ·_uint32_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt 
.LBB14_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540000c2 // b.hs .LBB14_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x2a1f03ea // mov w10, wzr WORD $0x14000013 // b .LBB14_7 LBB14_3: WORD $0xb900003f // str wzr, [x1] WORD $0xd65f03c0 // ret LBB14_4: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB14_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4ea08440 // add v0.4s, v2.4s, v0.4s WORD $0x4ea18461 // add v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB14_5 WORD $0x4ea08420 // add v0.4s, v1.4s, v0.4s WORD $0x4eb1b800 // addv s0, v0.4s WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, .LBB14_9 LBB14_7: WORD $0x8b09080b // add x11, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 LBB14_8: WORD $0xb8404569 // ldr w9, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0a012a // add w10, w9, w10 WORD $0x54ffffa1 // b.ne .LBB14_8 LBB14_9: WORD $0xb900002a // str w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint32_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xb9400008 // ldr w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003ab // b.lt .LBB15_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100213f // cmp x9, #8 WORD $0x54000062 // b.hs .LBB15_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB15_6 LBB15_3: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x4e040d00 // dup v0.4s, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB15_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x6ea06c40 // umin v0.4s, v2.4s, 
v0.4s WORD $0x6ea16c61 // umin v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB15_4 WORD $0x6ea16c00 // umin v0.4s, v0.4s, v1.4s WORD $0x6eb1a800 // uminv s0, v0.4s WORD $0x1e260008 // fmov w8, s0 WORD $0xb400010b // cbz x11, .LBB15_8 LBB15_6: WORD $0x8b0a080b // add x11, x0, x10, lsl #2 WORD $0xcb0a0129 // sub x9, x9, x10 LBB15_7: WORD $0xb840456a // ldr w10, [x11], #4 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a883148 // csel w8, w10, w8, lo WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff81 // b.ne .LBB15_7 LBB15_8: WORD $0xb9000028 // str w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint32_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xb9400008 // ldr w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003ab // b.lt .LBB16_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100213f // cmp x9, #8 WORD $0x54000062 // b.hs .LBB16_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB16_6 LBB16_3: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x4e040d00 // dup v0.4s, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB16_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x6ea06440 // umax v0.4s, v2.4s, v0.4s WORD $0x6ea16461 // umax v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB16_4 WORD $0x6ea16400 // umax v0.4s, v0.4s, v1.4s WORD $0x6eb0a800 // umaxv s0, v0.4s WORD $0x1e260008 // fmov w8, s0 WORD $0xb400010b // cbz x11, .LBB16_8 LBB16_6: WORD $0x8b0a080b // add x11, x0, x10, lsl #2 WORD $0xcb0a0129 // sub x9, x9, x10 LBB16_7: WORD $0xb840456a // ldr w10, [x11], #4 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a888148 // csel w8, w10, w8, hi WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff81 // b.ne .LBB16_7 LBB16_8: WORD $0xb9000028 // str w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint32_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD 
output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB17_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB17_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB17_8 LBB17_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB17_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB17_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB17_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4ea08440 // add v0.4s, v2.4s, v0.4s WORD $0x4ea18461 // add v1.4s, v3.4s, v1.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB17_6 WORD $0xb400018a // cbz x10, .LBB17_10 LBB17_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB17_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0xb8004549 // str w9, [x10], #4 WORD $0x54ffff61 // b.ne .LBB17_9 LBB17_10: WORD $0xd65f03c0 // ret TEXT ·_uint32_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB18_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB18_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b 
.LBB18_8 LBB18_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB18_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB18_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB18_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6ea28400 // sub v0.4s, v0.4s, v2.4s WORD $0x6ea38421 // sub v1.4s, v1.4s, v3.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB18_6 WORD $0xb400018a // cbz x10, .LBB18_10 LBB18_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB18_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0xb8004549 // str w9, [x10], #4 WORD $0x54ffff61 // b.ne .LBB18_9 LBB18_10: WORD $0xd65f03c0 // ret TEXT ·_uint32_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB19_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB19_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB19_8 LBB19_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB19_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB19_8 WORD 
$0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB19_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4ea09c40 // mul v0.4s, v2.4s, v0.4s WORD $0x4ea19c61 // mul v1.4s, v3.4s, v1.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB19_6 WORD $0xb400018a // cbz x10, .LBB19_10 LBB19_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB19_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0xb8004549 // str w9, [x10], #4 WORD $0x54ffff61 // b.ne .LBB19_9 LBB19_10: WORD $0xd65f03c0 // ret TEXT ·_uint32_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400068b // b.lt .LBB20_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB20_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000024 // b .LBB20_8 LBB20_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000403 // b.lo .LBB20_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x540003a3 // b.lo .LBB20_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0xaa0203eb // mov x11, x2 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0903ec // mov x12, x9 WORD $0xaa0003ee // mov x14, x0 LBB20_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr 
q1, [x13], #16 WORD $0xf100118c // subs x12, x12, #4 WORD $0x0e0c3c10 // mov w16, v0.s[1] WORD $0x1e260011 // fmov w17, s0 WORD $0x0e0c3c2f // mov w15, v1.s[1] WORD $0x0e143c12 // mov w18, v0.s[2] WORD $0x0e1c3c03 // mov w3, v0.s[3] WORD $0x1acf0a0f // udiv w15, w16, w15 WORD $0x1e260030 // fmov w16, s1 WORD $0x1ad00a30 // udiv w16, w17, w16 WORD $0x0e143c31 // mov w17, v1.s[2] WORD $0x1ad10a51 // udiv w17, w18, w17 WORD $0x0e1c3c32 // mov w18, v1.s[3] WORD $0x1e270200 // fmov s0, w16 WORD $0x4e0c1de0 // mov v0.s[1], w15 WORD $0x1ad2086f // udiv w15, w3, w18 WORD $0x4e141e20 // mov v0.s[2], w17 WORD $0x4e1c1de0 // mov v0.s[3], w15 WORD $0x3c810560 // str q0, [x11], #16 WORD $0x54fffd81 // b.ne .LBB20_6 WORD $0xb400018a // cbz x10, .LBB20_10 LBB20_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB20_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1acd0929 // udiv w9, w9, w13 WORD $0xb8004549 // str w9, [x10], #4 WORD $0x54ffff61 // b.ne .LBB20_9 LBB20_10: WORD $0xd65f03c0 // ret TEXT ·_uint64_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt .LBB21_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x540000c2 // b.hs .LBB21_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000013 // b .LBB21_7 LBB21_3: WORD $0xf900003f // str xzr, [x1] WORD $0xd65f03c0 // ret LBB21_4: WORD $0x9240044b // and x11, x2, #0x3 WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB21_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100118c // 
subs x12, x12, #4 WORD $0x4ee08440 // add v0.2d, v2.2d, v0.2d WORD $0x4ee18461 // add v1.2d, v3.2d, v1.2d WORD $0x54ffff61 // b.ne .LBB21_5 WORD $0x4ee08420 // add v0.2d, v1.2d, v0.2d WORD $0x5ef1b800 // addp d0, v0.2d WORD $0x9e66000a // fmov x10, d0 WORD $0xb40000eb // cbz x11, .LBB21_9 LBB21_7: WORD $0x8b090c0b // add x11, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 LBB21_8: WORD $0xf8408569 // ldr x9, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x8b0a012a // add x10, x9, x10 WORD $0x54ffffa1 // b.ne .LBB21_8 LBB21_9: WORD $0xf900002a // str x10, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint64_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xf9400009 // ldr x9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400044b // b.lt .LBB22_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB22_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000016 // b .LBB22_6 LBB22_3: WORD $0x9240044b // and x11, x2, #0x3 WORD $0x4e080d20 // dup v0.2d, x9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB22_4: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x6ee23404 // cmhi v4.2d, v0.2d, v2.2d WORD $0x6ee33425 // cmhi v5.2d, v1.2d, v3.2d WORD $0x6ea41c40 // bit v0.16b, v2.16b, v4.16b WORD $0x6ea51c61 // bit v1.16b, v3.16b, v5.16b WORD $0x54ffff21 // b.ne .LBB22_4 WORD $0x6ee03422 // cmhi v2.2d, v1.2d, v0.2d WORD $0x6ee21c20 // bif v0.16b, v1.16b, v2.16b WORD $0x6e004001 // ext v1.16b, v0.16b, v0.16b, #8 WORD $0x7ee03422 // cmhi d2, d1, d0 WORD $0x2ee21c20 // bif v0.8b, v1.8b, v2.8b WORD $0x9e660009 // fmov x9, d0 WORD $0xb400010b // cbz x11, .LBB22_8 LBB22_6: WORD $0x8b0a0c0b // add x11, x0, x10, lsl #3 WORD $0xcb0a0108 // sub x8, x8, x10 LBB22_7: WORD $0xf840856a // ldr x10, [x11], #8 WORD $0xeb09015f // cmp x10, x9 
WORD $0x9a893149 // csel x9, x10, x9, lo WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff81 // b.ne .LBB22_7 LBB22_8: WORD $0xf9000029 // str x9, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint64_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xf9400009 // ldr x9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400044b // b.lt .LBB23_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB23_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000016 // b .LBB23_6 LBB23_3: WORD $0x9240044b // and x11, x2, #0x3 WORD $0x4e080d20 // dup v0.2d, x9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB23_4: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x6ee03444 // cmhi v4.2d, v2.2d, v0.2d WORD $0x6ee13465 // cmhi v5.2d, v3.2d, v1.2d WORD $0x6ea41c40 // bit v0.16b, v2.16b, v4.16b WORD $0x6ea51c61 // bit v1.16b, v3.16b, v5.16b WORD $0x54ffff21 // b.ne .LBB23_4 WORD $0x6ee13402 // cmhi v2.2d, v0.2d, v1.2d WORD $0x6ee21c20 // bif v0.16b, v1.16b, v2.16b WORD $0x6e004001 // ext v1.16b, v0.16b, v0.16b, #8 WORD $0x7ee13402 // cmhi d2, d0, d1 WORD $0x2ee21c20 // bif v0.8b, v1.8b, v2.8b WORD $0x9e660009 // fmov x9, d0 WORD $0xb400010b // cbz x11, .LBB23_8 LBB23_6: WORD $0x8b0a0c0b // add x11, x0, x10, lsl #3 WORD $0xcb0a0108 // sub x8, x8, x10 LBB23_7: WORD $0xf840856a // ldr x10, [x11], #8 WORD $0xeb09015f // cmp x10, x9 WORD $0x9a898149 // csel x9, x10, x9, hi WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff81 // b.ne .LBB23_7 LBB23_8: WORD $0xf9000029 // str x9, [x1] WORD $0xd65f03c0 // ret TEXT ·_uint64_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB24_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // 
cmp x8, #4 WORD $0x54000062 // b.hs .LBB24_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB24_8 LBB24_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB24_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB24_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB24_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4ee08440 // add v0.2d, v2.2d, v0.2d WORD $0x4ee18461 // add v1.2d, v3.2d, v1.2d WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB24_6 WORD $0xb400018a // cbz x10, .LBB24_10 LBB24_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB24_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x8b0901a9 // add x9, x13, x9 WORD $0xf8008549 // str x9, [x10], #8 WORD $0x54ffff61 // b.ne .LBB24_9 LBB24_10: WORD $0xd65f03c0 // ret TEXT ·_uint64_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB25_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB25_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB25_8 LBB25_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB25_8 WORD $0xcb01004a // 
sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB25_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB25_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6ee28400 // sub v0.2d, v0.2d, v2.2d WORD $0x6ee38421 // sub v1.2d, v1.2d, v3.2d WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB25_6 WORD $0xb400018a // cbz x10, .LBB25_10 LBB25_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB25_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0xcb0d0129 // sub x9, x9, x13 WORD $0xf8008549 // str x9, [x10], #8 WORD $0x54ffff61 // b.ne .LBB25_9 LBB25_10: WORD $0xd65f03c0 // ret TEXT ·_uint64_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540006eb // b.lt .LBB26_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB26_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000027 // b .LBB26_8 LBB26_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54000463 // b.lo .LBB26_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000403 // b.lo .LBB26_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 
WORD $0xaa0903ee // mov x14, x9 LBB26_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0x9e660011 // fmov x17, d0 WORD $0x4e183c0f // mov x15, v0.d[1] WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9e660024 // fmov x4, d1 WORD $0x9100818c // add x12, x12, #32 WORD $0x4e183c23 // mov x3, v1.d[1] WORD $0x9e660052 // fmov x18, d2 WORD $0x4e183c50 // mov x16, v2.d[1] WORD $0x9e660065 // fmov x5, d3 WORD $0x4e183c66 // mov x6, v3.d[1] WORD $0x9b117e51 // mul x17, x18, x17 WORD $0x9b047cb2 // mul x18, x5, x4 WORD $0x9b0f7e0f // mul x15, x16, x15 WORD $0x9b037cd0 // mul x16, x6, x3 WORD $0x9e670220 // fmov d0, x17 WORD $0x9e670241 // fmov d1, x18 WORD $0x4e181de0 // mov v0.d[1], x15 WORD $0x4e181e01 // mov v1.d[1], x16 WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffd21 // b.ne .LBB26_6 WORD $0xb400018a // cbz x10, .LBB26_10 LBB26_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB26_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x9b097da9 // mul x9, x13, x9 WORD $0xf8008549 // str x9, [x10], #8 WORD $0x54ffff61 // b.ne .LBB26_9 LBB26_10: WORD $0xd65f03c0 // ret TEXT ·_uint64_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400058b // b.lt .LBB27_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100091f // cmp x8, #2 WORD $0x54000062 // b.hs .LBB27_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x1400001c // b .LBB27_8 LBB27_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000303 // b.lo .LBB27_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD 
$0x540002a3 // b.lo .LBB27_8 WORD $0x9240006a // and x10, x3, #0x1 WORD $0xaa0203eb // mov x11, x2 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0903ec // mov x12, x9 WORD $0xaa0003ee // mov x14, x0 LBB27_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0xf100098c // subs x12, x12, #2 WORD $0x9e660010 // fmov x16, d0 WORD $0x9e66002f // fmov x15, d1 WORD $0x4e183c11 // mov x17, v0.d[1] WORD $0x9acf0a0f // udiv x15, x16, x15 WORD $0x4e183c30 // mov x16, v1.d[1] WORD $0x9ad00a30 // udiv x16, x17, x16 WORD $0x9e6701e0 // fmov d0, x15 WORD $0x4e181e00 // mov v0.d[1], x16 WORD $0x3c810560 // str q0, [x11], #16 WORD $0x54fffe81 // b.ne .LBB27_6 WORD $0xb400018a // cbz x10, .LBB27_10 LBB27_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB27_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x9acd0929 // udiv x9, x9, x13 WORD $0xf8008549 // str x9, [x10], #8 WORD $0x54ffff61 // b.ne .LBB27_9 LBB27_10: WORD $0xd65f03c0 // ret TEXT ·_int8_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt .LBB28_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540000c2 // b.hs .LBB28_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x2a1f03ea // mov w10, wzr WORD $0x14000028 // b .LBB28_13 LBB28_3: WORD $0x3900003f // strb wzr, [x1] WORD $0xd65f03c0 // ret LBB28_4: WORD $0xf100811f // cmp x8, #32 WORD $0x54000082 // b.hs .LBB28_6 WORD $0x2a1f03ea // mov w10, wzr WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000013 // b .LBB28_10 LBB28_6: WORD $0x9240104b // and x11, x2, #0x1f WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD 
$0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB28_7: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x4e208440 // add v0.16b, v2.16b, v0.16b WORD $0x4e218461 // add v1.16b, v3.16b, v1.16b WORD $0x54ffff61 // b.ne .LBB28_7 WORD $0x4e208420 // add v0.16b, v1.16b, v0.16b WORD $0x4e31b800 // addv b0, v0.16b WORD $0x1e26000a // fmov w10, s0 WORD $0xb40002eb // cbz x11, .LBB28_15 WORD $0xf100217f // cmp x11, #8 WORD $0x540001e3 // b.lo .LBB28_13 LBB28_10: WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0x9240084b // and x11, x2, #0x7 WORD $0x8b0b012c // add x12, x9, x11 WORD $0xcb08018c // sub x12, x12, x8 WORD $0x4e011d40 // mov v0.b[0], w10 WORD $0x8b09000a // add x10, x0, x9 WORD $0xcb0b0109 // sub x9, x8, x11 LBB28_11: WORD $0xfc408541 // ldr d1, [x10], #8 WORD $0xb100218c // adds x12, x12, #8 WORD $0x0e208420 // add v0.8b, v1.8b, v0.8b WORD $0x54ffffa1 // b.ne .LBB28_11 WORD $0x0e31b800 // addv b0, v0.8b WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, .LBB28_15 LBB28_13: WORD $0x8b09000b // add x11, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB28_14: WORD $0x38401569 // ldrb w9, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0a012a // add w10, w9, w10 WORD $0x54ffffa1 // b.ne .LBB28_14 LBB28_15: WORD $0x3900002a // strb w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_int8_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x39400009 // ldrb w9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400062b // b.lt .LBB29_14 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB29_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000024 // b .LBB29_12 LBB29_3: WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB29_5 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000013 // b .LBB29_9 LBB29_5: WORD $0x9240104b // and x11, x2, #0x1f WORD $0x4e010d20 // dup v0.16b, 
w9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB29_6: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x4e206c40 // smin v0.16b, v2.16b, v0.16b WORD $0x4e216c61 // smin v1.16b, v3.16b, v1.16b WORD $0x54ffff61 // b.ne .LBB29_6 WORD $0x4e216c00 // smin v0.16b, v0.16b, v1.16b WORD $0x4e31a800 // sminv b0, v0.16b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400030b // cbz x11, .LBB29_14 WORD $0xf100217f // cmp x11, #8 WORD $0x540001c3 // b.lo .LBB29_12 LBB29_9: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x0e010d20 // dup v0.8b, w9 WORD $0x8b0b014c // add x12, x10, x11 WORD $0x8b0a0009 // add x9, x0, x10 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0xcb08018c // sub x12, x12, x8 LBB29_10: WORD $0xfc408521 // ldr d1, [x9], #8 WORD $0xb100218c // adds x12, x12, #8 WORD $0x0e206c20 // smin v0.8b, v1.8b, v0.8b WORD $0x54ffffa1 // b.ne .LBB29_10 WORD $0x0e31a800 // sminv b0, v0.8b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400012b // cbz x11, .LBB29_14 LBB29_12: WORD $0x8b0a000b // add x11, x0, x10 WORD $0xcb0a0108 // sub x8, x8, x10 LBB29_13: WORD $0x38c0156a // ldrsb w10, [x11], #1 WORD $0x13001d29 // sxtb w9, w9 WORD $0x6b09015f // cmp w10, w9 WORD $0x1a89b149 // csel w9, w10, w9, lt WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne .LBB29_13 LBB29_14: WORD $0x39000029 // strb w9, [x1] WORD $0xd65f03c0 // ret TEXT ·_int8_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x39400009 // ldrb w9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400062b // b.lt .LBB30_14 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB30_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000024 // b .LBB30_12 LBB30_3: WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB30_5 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000013 // 
b .LBB30_9 LBB30_5: WORD $0x9240104b // and x11, x2, #0x1f WORD $0x4e010d20 // dup v0.16b, w9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB30_6: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100818c // subs x12, x12, #32 WORD $0x4e206440 // smax v0.16b, v2.16b, v0.16b WORD $0x4e216461 // smax v1.16b, v3.16b, v1.16b WORD $0x54ffff61 // b.ne .LBB30_6 WORD $0x4e216400 // smax v0.16b, v0.16b, v1.16b WORD $0x4e30a800 // smaxv b0, v0.16b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400030b // cbz x11, .LBB30_14 WORD $0xf100217f // cmp x11, #8 WORD $0x540001c3 // b.lo .LBB30_12 LBB30_9: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x0e010d20 // dup v0.8b, w9 WORD $0x8b0b014c // add x12, x10, x11 WORD $0x8b0a0009 // add x9, x0, x10 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0xcb08018c // sub x12, x12, x8 LBB30_10: WORD $0xfc408521 // ldr d1, [x9], #8 WORD $0xb100218c // adds x12, x12, #8 WORD $0x0e206420 // smax v0.8b, v1.8b, v0.8b WORD $0x54ffffa1 // b.ne .LBB30_10 WORD $0x0e30a800 // smaxv b0, v0.8b WORD $0x1e260009 // fmov w9, s0 WORD $0xb400012b // cbz x11, .LBB30_14 LBB30_12: WORD $0x8b0a000b // add x11, x0, x10 WORD $0xcb0a0108 // sub x8, x8, x10 LBB30_13: WORD $0x38c0156a // ldrsb w10, [x11], #1 WORD $0x13001d29 // sxtb w9, w9 WORD $0x6b09015f // cmp w10, w9 WORD $0x1a89c149 // csel w9, w10, w9, gt WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff61 // b.ne .LBB30_13 LBB30_14: WORD $0x39000029 // strb w9, [x1] WORD $0xd65f03c0 // ret TEXT ·_int8_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt .LBB31_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs .LBB31_6 WORD $0xaa1f03e9 // mov x9, xzr LBB31_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 
WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB31_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0x38001549 // strb w9, [x10], #1 WORD $0x54ffff61 // b.ne .LBB31_4 LBB31_5: WORD $0xd65f03c0 // ret LBB31_6: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo .LBB31_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo .LBB31_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB31_10 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000014 // b .LBB31_14 LBB31_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB31_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e208440 // add v0.16b, v2.16b, v0.16b WORD $0x4e218461 // add v1.16b, v3.16b, v1.16b WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB31_11 WORD $0xb4fffc8a // cbz x10, .LBB31_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo .LBB31_3 LBB31_14: WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b0a012e // add x14, x9, x10 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xcb0801ce // sub x14, x14, x8 LBB31_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x0e208420 // add v0.8b, v1.8b, v0.8b WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0x54ffff61 // b.ne .LBB31_15 WORD $0xb5fff94a // 
cbnz x10, .LBB31_3 WORD $0x17ffffd3 // b .LBB31_5 TEXT ·_int8_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt .LBB32_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs .LBB32_6 WORD $0xaa1f03e9 // mov x9, xzr LBB32_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB32_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0x38001549 // strb w9, [x10], #1 WORD $0x54ffff61 // b.ne .LBB32_4 LBB32_5: WORD $0xd65f03c0 // ret LBB32_6: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo .LBB32_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo .LBB32_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB32_10 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000014 // b .LBB32_14 LBB32_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB32_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6e228400 // sub v0.16b, v0.16b, v2.16b WORD $0x6e238421 // sub v1.16b, v1.16b, v3.16b WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB32_11 WORD $0xb4fffc8a // cbz x10, .LBB32_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo .LBB32_3 LBB32_14: WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, 
x0, x9 WORD $0x8b0a012e // add x14, x9, x10 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xcb0801ce // sub x14, x14, x8 LBB32_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x2e218400 // sub v0.8b, v0.8b, v1.8b WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0x54ffff61 // b.ne .LBB32_15 WORD $0xb5fff94a // cbnz x10, .LBB32_3 WORD $0x17ffffd3 // b .LBB32_5 TEXT ·_int8_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x540001eb // b.lt .LBB33_5 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540001a2 // b.hs .LBB33_6 WORD $0xaa1f03e9 // mov x9, xzr LBB33_3: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB33_4: WORD $0x38401589 // ldrb w9, [x12], #1 WORD $0x3840156d // ldrb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0x38001549 // strb w9, [x10], #1 WORD $0x54ffff61 // b.ne .LBB33_4 LBB33_5: WORD $0xd65f03c0 // ret LBB33_6: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffe43 // b.lo .LBB33_3 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54fffde3 // b.lo .LBB33_3 WORD $0xf100811f // cmp x8, #32 WORD $0x54000062 // b.hs .LBB33_10 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000014 // b .LBB33_14 LBB33_10: WORD $0x9240106a // and x10, x3, #0x1f WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB33_11: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10081ce // subs x14, x14, #32 WORD $0xad7f8d82 // ldp 
q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e209c40 // mul v0.16b, v2.16b, v0.16b WORD $0x4e219c61 // mul v1.16b, v3.16b, v1.16b WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB33_11 WORD $0xb4fffc8a // cbz x10, .LBB33_5 WORD $0xf100215f // cmp x10, #8 WORD $0x54fffb03 // b.lo .LBB33_3 LBB33_14: WORD $0x9240086a // and x10, x3, #0x7 WORD $0x8b09000b // add x11, x0, x9 WORD $0x8b0a012e // add x14, x9, x10 WORD $0x8b09002c // add x12, x1, x9 WORD $0x8b09004d // add x13, x2, x9 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xcb0801ce // sub x14, x14, x8 LBB33_15: WORD $0xfc408560 // ldr d0, [x11], #8 WORD $0xfc408581 // ldr d1, [x12], #8 WORD $0xb10021ce // adds x14, x14, #8 WORD $0x0e209c20 // mul v0.8b, v1.8b, v0.8b WORD $0xfc0085a0 // str d0, [x13], #8 WORD $0x54ffff61 // b.ne .LBB33_15 WORD $0xb5fff94a // cbnz x10, .LBB33_3 WORD $0x17ffffd3 // b .LBB33_5 TEXT ·_int8_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0xf81d0ff7 // str x23, [sp, #-48]! 
WORD $0x7100047f // cmp w3, #1 WORD $0xa90157f6 // stp x22, x21, [sp, #16] WORD $0xa9024ff4 // stp x20, x19, [sp, #32] WORD $0x54000d0b // b.lt .LBB34_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB34_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000059 // b .LBB34_8 LBB34_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000aa3 // b.lo .LBB34_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x54000a43 // b.lo .LBB34_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0xaa0203eb // mov x11, x2 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0903ec // mov x12, x9 WORD $0xaa0003ee // mov x14, x0 LBB34_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0xf100418c // subs x12, x12, #16 WORD $0x4f08a402 // sshll2 v2.8h, v0.16b, #0 WORD $0x4f08a423 // sshll2 v3.8h, v1.16b, #0 WORD $0x0e062c50 // smov w16, v2.h[1] WORD $0x0e062c6f // smov w15, v3.h[1] WORD $0x0e022c51 // smov w17, v2.h[0] WORD $0x0e0a2c52 // smov w18, v2.h[2] WORD $0x0e0e2c43 // smov w3, v2.h[3] WORD $0x0e122c44 // smov w4, v2.h[4] WORD $0x0e162c45 // smov w5, v2.h[5] WORD $0x0e1a2c46 // smov w6, v2.h[6] WORD $0x0e1e2c47 // smov w7, v2.h[7] WORD $0x1acf0e0f // sdiv w15, w16, w15 WORD $0x0e022c70 // smov w16, v3.h[0] WORD $0x0f08a400 // sshll v0.8h, v0.8b, #0 WORD $0x0f08a421 // sshll v1.8h, v1.8b, #0 WORD $0x0e062c13 // smov w19, v0.h[1] WORD $0x0e022c14 // smov w20, v0.h[0] WORD $0x0e0a2c15 // smov w21, v0.h[2] WORD $0x0e0e2c16 // smov w22, v0.h[3] WORD $0x0e122c17 // smov w23, v0.h[4] WORD $0x1ad00e30 // sdiv w16, w17, w16 WORD $0x0e0a2c71 // smov w17, v3.h[2] WORD $0x1ad10e51 // sdiv w17, w18, w17 WORD $0x0e0e2c72 // smov w18, v3.h[3] WORD $0x1e270202 // fmov s2, w16 WORD $0x0e162c30 // smov w16, v1.h[5] WORD $0x4e061de2 // mov v2.h[1], w15 WORD $0x1ad20c72 // sdiv w18, w3, w18 WORD $0x0e122c63 
// smov w3, v3.h[4] WORD $0x4e0a1e22 // mov v2.h[2], w17 WORD $0x0e1a2c11 // smov w17, v0.h[6] WORD $0x1ac30c83 // sdiv w3, w4, w3 WORD $0x0e162c64 // smov w4, v3.h[5] WORD $0x4e0e1e42 // mov v2.h[3], w18 WORD $0x0e1e2c12 // smov w18, v0.h[7] WORD $0x1ac40ca4 // sdiv w4, w5, w4 WORD $0x0e1a2c65 // smov w5, v3.h[6] WORD $0x4e121c62 // mov v2.h[4], w3 WORD $0x1ac50cc5 // sdiv w5, w6, w5 WORD $0x0e1e2c66 // smov w6, v3.h[7] WORD $0x4e161c82 // mov v2.h[5], w4 WORD $0x1ac60ce6 // sdiv w6, w7, w6 WORD $0x0e062c27 // smov w7, v1.h[1] WORD $0x4e1a1ca2 // mov v2.h[6], w5 WORD $0x1ac70e67 // sdiv w7, w19, w7 WORD $0x0e022c33 // smov w19, v1.h[0] WORD $0x4e1e1cc2 // mov v2.h[7], w6 WORD $0x1ad30e93 // sdiv w19, w20, w19 WORD $0x0e0a2c34 // smov w20, v1.h[2] WORD $0x1ad40eb4 // sdiv w20, w21, w20 WORD $0x0e0e2c35 // smov w21, v1.h[3] WORD $0x1e270263 // fmov s3, w19 WORD $0x0e162c13 // smov w19, v0.h[5] WORD $0x4e061ce3 // mov v3.h[1], w7 WORD $0x1ad50ed5 // sdiv w21, w22, w21 WORD $0x0e122c36 // smov w22, v1.h[4] WORD $0x4e0a1e83 // mov v3.h[2], w20 WORD $0x1ad60ef6 // sdiv w22, w23, w22 WORD $0x4e0e1ea3 // mov v3.h[3], w21 WORD $0x1ad00e6f // sdiv w15, w19, w16 WORD $0x0e1a2c30 // smov w16, v1.h[6] WORD $0x4e121ec3 // mov v3.h[4], w22 WORD $0x1ad00e30 // sdiv w16, w17, w16 WORD $0x0e1e2c31 // smov w17, v1.h[7] WORD $0x4e161de3 // mov v3.h[5], w15 WORD $0x1ad10e4f // sdiv w15, w18, w17 WORD $0x4e1a1e03 // mov v3.h[6], w16 WORD $0x4e1e1de3 // mov v3.h[7], w15 WORD $0x4e021860 // uzp1 v0.16b, v3.16b, v2.16b WORD $0x3c810560 // str q0, [x11], #16 WORD $0x54fff6e1 // b.ne .LBB34_6 WORD $0xb400016a // cbz x10, .LBB34_10 LBB34_8: WORD $0x8b09004a // add x10, x2, x9 WORD $0x8b09002b // add x11, x1, x9 WORD $0x8b09000c // add x12, x0, x9 WORD $0xcb090108 // sub x8, x8, x9 LBB34_9: WORD $0x38c01589 // ldrsb w9, [x12], #1 WORD $0x38c0156d // ldrsb w13, [x11], #1 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1acd0d29 // sdiv w9, w9, w13 WORD $0x38001549 // strb w9, [x10], #1 WORD 
$0x54ffff61 // b.ne .LBB34_9 LBB34_10: WORD $0xa9424ff4 // ldp x20, x19, [sp, #32] WORD $0xa94157f6 // ldp x22, x21, [sp, #16] WORD $0xf84307f7 // ldr x23, [sp], #48 WORD $0xd65f03c0 // ret TEXT ·_int16_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt .LBB35_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x540000c2 // b.hs .LBB35_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x2a1f03ea // mov w10, wzr WORD $0x14000013 // b .LBB35_7 LBB35_3: WORD $0x7900003f // strh wzr, [x1] WORD $0xd65f03c0 // ret LBB35_4: WORD $0x92400c4b // and x11, x2, #0xf WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB35_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x4e608440 // add v0.8h, v2.8h, v0.8h WORD $0x4e618461 // add v1.8h, v3.8h, v1.8h WORD $0x54ffff61 // b.ne .LBB35_5 WORD $0x4e608420 // add v0.8h, v1.8h, v0.8h WORD $0x4e71b800 // addv h0, v0.8h WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, .LBB35_9 LBB35_7: WORD $0x8b09040b // add x11, x0, x9, lsl #1 WORD $0xcb090108 // sub x8, x8, x9 LBB35_8: WORD $0x78402569 // ldrh w9, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0a012a // add w10, w9, w10 WORD $0x54ffffa1 // b.ne .LBB35_8 LBB35_9: WORD $0x7900002a // strh w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_int16_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt .LBB36_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs .LBB36_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB36_6 LBB36_3: WORD $0x92400c4b // and x11, x2, #0xf 
WORD $0x4e020d00 // dup v0.8h, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB36_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x4e606c40 // smin v0.8h, v2.8h, v0.8h WORD $0x4e616c61 // smin v1.8h, v3.8h, v1.8h WORD $0x54ffff61 // b.ne .LBB36_4 WORD $0x4e616c00 // smin v0.8h, v0.8h, v1.8h WORD $0x4e71a800 // sminv h0, v0.8h WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, .LBB36_8 LBB36_6: WORD $0x8b0a040b // add x11, x0, x10, lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 LBB36_7: WORD $0x78c0256a // ldrsh w10, [x11], #2 WORD $0x13003d08 // sxth w8, w8 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88b148 // csel w8, w10, w8, lt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne .LBB36_7 LBB36_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int16_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x79400008 // ldrh w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003cb // b.lt .LBB37_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100413f // cmp x9, #16 WORD $0x54000062 // b.hs .LBB37_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB37_6 LBB37_3: WORD $0x92400c4b // and x11, x2, #0xf WORD $0x4e020d00 // dup v0.8h, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB37_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100418c // subs x12, x12, #16 WORD $0x4e606440 // smax v0.8h, v2.8h, v0.8h WORD $0x4e616461 // smax v1.8h, v3.8h, v1.8h WORD $0x54ffff61 // b.ne .LBB37_4 WORD $0x4e616400 // smax v0.8h, v0.8h, v1.8h WORD $0x4e70a800 // smaxv h0, v0.8h WORD $0x1e260008 // fmov w8, s0 WORD $0xb400012b // cbz x11, .LBB37_8 LBB37_6: WORD $0x8b0a040b // add x11, x0, 
x10, lsl #1 WORD $0xcb0a0129 // sub x9, x9, x10 LBB37_7: WORD $0x78c0256a // ldrsh w10, [x11], #2 WORD $0x13003d08 // sxth w8, w8 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88c148 // csel w8, w10, w8, gt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff61 // b.ne .LBB37_7 LBB37_8: WORD $0x79000028 // strh w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int16_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB38_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB38_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB38_8 LBB38_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB38_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB38_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB38_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e608440 // add v0.8h, v2.8h, v0.8h WORD $0x4e618461 // add v1.8h, v3.8h, v1.8h WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB38_6 WORD $0xb400018a // cbz x10, .LBB38_10 LBB38_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB38_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0x78002549 // strh w9, 
[x10], #2 WORD $0x54ffff61 // b.ne .LBB38_9 LBB38_10: WORD $0xd65f03c0 // ret TEXT ·_int16_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB39_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB39_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB39_8 LBB39_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB39_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB39_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB39_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6e628400 // sub v0.8h, v0.8h, v2.8h WORD $0x6e638421 // sub v1.8h, v1.8h, v3.8h WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB39_6 WORD $0xb400018a // cbz x10, .LBB39_10 LBB39_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB39_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0x78002549 // strh w9, [x10], #2 WORD $0x54ffff61 // b.ne .LBB39_9 LBB39_10: WORD $0xd65f03c0 // ret TEXT ·_int16_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB40_10 WORD 
$0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100411f // cmp x8, #16 WORD $0x54000062 // b.hs .LBB40_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB40_8 LBB40_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB40_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB40_8 WORD $0x92400c6a // and x10, x3, #0xf WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB40_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10041ce // subs x14, x14, #16 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e609c40 // mul v0.8h, v2.8h, v0.8h WORD $0x4e619c61 // mul v1.8h, v3.8h, v1.8h WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB40_6 WORD $0xb400018a // cbz x10, .LBB40_10 LBB40_8: WORD $0xd37ff92c // lsl x12, x9, #1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB40_9: WORD $0x78402589 // ldrh w9, [x12], #2 WORD $0x7840256d // ldrh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1b097da9 // mul w9, w13, w9 WORD $0x78002549 // strh w9, [x10], #2 WORD $0x54ffff61 // b.ne .LBB40_9 LBB40_10: WORD $0xd65f03c0 // ret TEXT ·_int16_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400092b // b.lt .LBB41_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB41_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000039 // b .LBB41_8 LBB41_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp 
x10, #16 WORD $0x540006a3 // b.lo .LBB41_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x54000643 // b.lo .LBB41_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0xaa0203eb // mov x11, x2 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0903ec // mov x12, x9 WORD $0xaa0003ee // mov x14, x0 LBB41_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4f10a402 // sshll2 v2.4s, v0.8h, #0 WORD $0x4f10a423 // sshll2 v3.4s, v1.8h, #0 WORD $0x0e0c3c50 // mov w16, v2.s[1] WORD $0x0e0c3c6f // mov w15, v3.s[1] WORD $0x1e260051 // fmov w17, s2 WORD $0x0e143c52 // mov w18, v2.s[2] WORD $0x0e1c3c43 // mov w3, v2.s[3] WORD $0x0f10a400 // sshll v0.4s, v0.4h, #0 WORD $0x0f10a421 // sshll v1.4s, v1.4h, #0 WORD $0x1acf0e0f // sdiv w15, w16, w15 WORD $0x1e260070 // fmov w16, s3 WORD $0x0e0c3c04 // mov w4, v0.s[1] WORD $0x1e260005 // fmov w5, s0 WORD $0x0e143c06 // mov w6, v0.s[2] WORD $0x0e1c3c07 // mov w7, v0.s[3] WORD $0x1ad00e30 // sdiv w16, w17, w16 WORD $0x0e143c71 // mov w17, v3.s[2] WORD $0x1ad10e51 // sdiv w17, w18, w17 WORD $0x0e1c3c72 // mov w18, v3.s[3] WORD $0x1e270200 // fmov s0, w16 WORD $0x4e0c1de0 // mov v0.s[1], w15 WORD $0x1ad20c72 // sdiv w18, w3, w18 WORD $0x0e0c3c23 // mov w3, v1.s[1] WORD $0x4e141e20 // mov v0.s[2], w17 WORD $0x1ac30c83 // sdiv w3, w4, w3 WORD $0x1e260024 // fmov w4, s1 WORD $0x4e1c1e40 // mov v0.s[3], w18 WORD $0x1ac40ca4 // sdiv w4, w5, w4 WORD $0x0e143c25 // mov w5, v1.s[2] WORD $0x1ac50cc5 // sdiv w5, w6, w5 WORD $0x0e1c3c26 // mov w6, v1.s[3] WORD $0x1e270081 // fmov s1, w4 WORD $0x4e0c1c61 // mov v1.s[1], w3 WORD $0x1ac60cef // sdiv w15, w7, w6 WORD $0x4e141ca1 // mov v1.s[2], w5 WORD $0x4e1c1de1 // mov v1.s[3], w15 WORD $0x4e401820 // uzp1 v0.8h, v1.8h, v0.8h WORD $0x3c810560 // str q0, [x11], #16 WORD $0x54fffae1 // b.ne .LBB41_6 WORD $0xb400018a // cbz x10, .LBB41_10 LBB41_8: WORD $0xd37ff92c // lsl x12, x9, 
#1 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB41_9: WORD $0x78c02589 // ldrsh w9, [x12], #2 WORD $0x78c0256d // ldrsh w13, [x11], #2 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1acd0d29 // sdiv w9, w9, w13 WORD $0x78002549 // strh w9, [x10], #2 WORD $0x54ffff61 // b.ne .LBB41_9 LBB41_10: WORD $0xd65f03c0 // ret TEXT ·_int32_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt .LBB42_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x540000c2 // b.hs .LBB42_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x2a1f03ea // mov w10, wzr WORD $0x14000013 // b .LBB42_7 LBB42_3: WORD $0xb900003f // str wzr, [x1] WORD $0xd65f03c0 // ret LBB42_4: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB42_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4ea08440 // add v0.4s, v2.4s, v0.4s WORD $0x4ea18461 // add v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB42_5 WORD $0x4ea08420 // add v0.4s, v1.4s, v0.4s WORD $0x4eb1b800 // addv s0, v0.4s WORD $0x1e26000a // fmov w10, s0 WORD $0xb40000eb // cbz x11, .LBB42_9 LBB42_7: WORD $0x8b09080b // add x11, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 LBB42_8: WORD $0xb8404569 // ldr w9, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0a012a // add w10, w9, w10 WORD $0x54ffffa1 // b.ne .LBB42_8 LBB42_9: WORD $0xb900002a // str w10, [x1] WORD $0xd65f03c0 // ret TEXT ·_int32_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xb9400008 // ldr w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003ab // b.lt .LBB43_8 
WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100213f // cmp x9, #8 WORD $0x54000062 // b.hs .LBB43_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB43_6 LBB43_3: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x4e040d00 // dup v0.4s, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB43_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4ea06c40 // smin v0.4s, v2.4s, v0.4s WORD $0x4ea16c61 // smin v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB43_4 WORD $0x4ea16c00 // smin v0.4s, v0.4s, v1.4s WORD $0x4eb1a800 // sminv s0, v0.4s WORD $0x1e260008 // fmov w8, s0 WORD $0xb400010b // cbz x11, .LBB43_8 LBB43_6: WORD $0x8b0a080b // add x11, x0, x10, lsl #2 WORD $0xcb0a0129 // sub x9, x9, x10 LBB43_7: WORD $0xb840456a // ldr w10, [x11], #4 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88b148 // csel w8, w10, w8, lt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff81 // b.ne .LBB43_7 LBB43_8: WORD $0xb9000028 // str w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int32_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xb9400008 // ldr w8, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x540003ab // b.lt .LBB44_8 WORD $0x92407c49 // and x9, x2, #0xffffffff WORD $0xf100213f // cmp x9, #8 WORD $0x54000062 // b.hs .LBB44_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000011 // b .LBB44_6 LBB44_3: WORD $0x9240084b // and x11, x2, #0x7 WORD $0x4e040d00 // dup v0.4s, w8 WORD $0xcb0b012a // sub x10, x9, x11 WORD $0x91004008 // add x8, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB44_4: WORD $0xad7f8d02 // ldp q2, q3, [x8, #-16] WORD $0x91008108 // add x8, x8, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4ea06440 // smax v0.4s, v2.4s, v0.4s WORD $0x4ea16461 // smax v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB44_4 WORD 
$0x4ea16400 // smax v0.4s, v0.4s, v1.4s WORD $0x4eb0a800 // smaxv s0, v0.4s WORD $0x1e260008 // fmov w8, s0 WORD $0xb400010b // cbz x11, .LBB44_8 LBB44_6: WORD $0x8b0a080b // add x11, x0, x10, lsl #2 WORD $0xcb0a0129 // sub x9, x9, x10 LBB44_7: WORD $0xb840456a // ldr w10, [x11], #4 WORD $0x6b08015f // cmp w10, w8 WORD $0x1a88c148 // csel w8, w10, w8, gt WORD $0xf1000529 // subs x9, x9, #1 WORD $0x54ffff81 // b.ne .LBB44_7 LBB44_8: WORD $0xb9000028 // str w8, [x1] WORD $0xd65f03c0 // ret TEXT ·_int32_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB45_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB45_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB45_8 LBB45_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB45_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB45_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB45_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4ea08440 // add v0.4s, v2.4s, v0.4s WORD $0x4ea18461 // add v1.4s, v3.4s, v1.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB45_6 WORD $0xb400018a // cbz x10, .LBB45_10 LBB45_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB45_9: WORD $0xb8404589 // ldr w9, [x12], 
#4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x0b0901a9 // add w9, w13, w9 WORD $0xb8004549 // str w9, [x10], #4 WORD $0x54ffff61 // b.ne .LBB45_9 LBB45_10: WORD $0xd65f03c0 // ret TEXT ·_int32_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB46_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB46_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB46_8 LBB46_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB46_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB46_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB46_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6ea28400 // sub v0.4s, v0.4s, v2.4s WORD $0x6ea38421 // sub v1.4s, v1.4s, v3.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB46_6 WORD $0xb400018a // cbz x10, .LBB46_10 LBB46_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB46_9: WORD $0xb8404589 // ldr w9, [x12], #4 WORD $0xb840456d // ldr w13, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x4b0d0129 // sub w9, w9, w13 WORD $0xb8004549 // str w9, [x10], #4 WORD $0x54ffff61 // b.ne .LBB46_9 LBB46_10: WORD $0xd65f03c0 // ret TEXT ·_int32_mul(SB), $0-32 MOVD input1+0(FP), R0 
// (continuation of ·_int32_mul: its TEXT directive and first argument load
// precede this point in the file)
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt .LBB47_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100211f // cmp x8, #8
	WORD $0x54000062 // b.hs .LBB47_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000019 // b .LBB47_8
LBB47_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo .LBB47_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo .LBB47_8
	WORD $0x9240086a // and x10, x3, #0x7
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
LBB47_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf10021ce // subs x14, x14, #8
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x4ea09c40 // mul v0.4s, v2.4s, v0.4s
	WORD $0x4ea19c61 // mul v1.4s, v3.4s, v1.4s
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffee1 // b.ne .LBB47_6
	WORD $0xb400018a // cbz x10, .LBB47_10
LBB47_8:
	WORD $0xd37ef52c // lsl x12, x9, #2
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB47_9:
	WORD $0xb8404589 // ldr w9, [x12], #4
	WORD $0xb840456d // ldr w13, [x11], #4
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1b097da9 // mul w9, w13, w9
	WORD $0xb8004549 // str w9, [x10], #4
	WORD $0x54ffff61 // b.ne .LBB47_9
LBB47_10:
	WORD $0xd65f03c0 // ret

// ·_int32_div writes output[i] = input1[i] / input2[i] (signed 32-bit SDIV)
// for i in [0, size).
//
// Arguments (loaded into R0..R3 below):
//   input1+0(FP)   pointer to the dividends
//   input2+8(FP)   pointer to the divisors
//   output+16(FP)  pointer to the destination
//   size+24(FP)    element count; only the low 32 bits are used and they are
//                  compared as a signed value, so a count < 1 returns at once
//
// Paths:
//   * LBB48_6 handles 4 elements per iteration. A64 has no vector integer
//     divide, so each lane is moved to a general register, divided with SDIV
//     and re-inserted. It is taken only when size >= 4 and the unsigned
//     distance output-input1 and output-input2 are both >= 16 bytes.
//   * LBB48_9 is the scalar loop for the remainder or when the checks fail.
//
// Per the Arm architecture SDIV does not trap: a zero divisor yields 0.
//
// Fix: the generated code used w18 as a lane temporary. x18 is the platform
// register (reserved on Apple arm64 and exposed by Go only as R18_PLATFORM);
// its value may change asynchronously there, corrupting a quotient. The four
// affected instructions now use w4, which this function does not otherwise
// touch. Instruction count and all branch offsets are unchanged.
TEXT ·_int32_div(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400068b // b.lt .LBB48_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB48_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000024 // b .LBB48_8
LBB48_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100415f // cmp x10, #16
	WORD $0x54000403 // b.lo .LBB48_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100415f // cmp x10, #16
	WORD $0x540003a3 // b.lo .LBB48_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0xaa0203eb // mov x11, x2
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0xaa0103ed // mov x13, x1
	WORD $0xaa0903ec // mov x12, x9
	WORD $0xaa0003ee // mov x14, x0
LBB48_6:
	WORD $0x3cc105c0 // ldr q0, [x14], #16
	WORD $0x3cc105a1 // ldr q1, [x13], #16
	WORD $0xf100118c // subs x12, x12, #4
	WORD $0x0e0c3c10 // mov w16, v0.s[1]
	WORD $0x1e260011 // fmov w17, s0
	WORD $0x0e0c3c2f // mov w15, v1.s[1]
	WORD $0x0e143c04 // mov w4, v0.s[2]      (was w18)
	WORD $0x0e1c3c03 // mov w3, v0.s[3]
	WORD $0x1acf0e0f // sdiv w15, w16, w15
	WORD $0x1e260030 // fmov w16, s1
	WORD $0x1ad00e30 // sdiv w16, w17, w16
	WORD $0x0e143c31 // mov w17, v1.s[2]
	WORD $0x1ad10c91 // sdiv w17, w4, w17    (was w18)
	WORD $0x0e1c3c24 // mov w4, v1.s[3]      (was w18)
	WORD $0x1e270200 // fmov s0, w16
	WORD $0x4e0c1de0 // mov v0.s[1], w15
	WORD $0x1ac40c6f // sdiv w15, w3, w4     (was w18)
	WORD $0x4e141e20 // mov v0.s[2], w17
	WORD $0x4e1c1de0 // mov v0.s[3], w15
	WORD $0x3c810560 // str q0, [x11], #16
	WORD $0x54fffd81 // b.ne .LBB48_6
	WORD $0xb400018a // cbz x10, .LBB48_10
LBB48_8:
	WORD $0xd37ef52c // lsl x12, x9, #2
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB48_9:
	WORD $0xb8404589 // ldr w9, [x12], #4
	WORD $0xb840456d // ldr w13, [x11], #4
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1acd0d29 // sdiv w9, w9, w13
	WORD $0xb8004549 // str w9, [x10], #4
	WORD $0x54ffff61 // b.ne .LBB48_9
LBB48_10:
	WORD $0xd65f03c0 // ret

// (start of ·_int64_sum; the remainder of the function follows this span)
TEXT ·_int64_sum(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0x7100045f // cmp w2, #1
	WORD $0x540000eb // b.lt .LBB49_3
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	// The operand of the WORD below is the first token of the next line of this file.
	WORD
$0x540000c2 // b.hs .LBB49_4 WORD $0xaa1f03e9 // mov x9, xzr WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000013 // b .LBB49_7 LBB49_3: WORD $0xf900003f // str xzr, [x1] WORD $0xd65f03c0 // ret LBB49_4: WORD $0x9240044b // and x11, x2, #0x3 WORD $0x9100400a // add x10, x0, #16 WORD $0xcb0b0109 // sub x9, x8, x11 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB49_5: WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16] WORD $0x9100814a // add x10, x10, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x4ee08440 // add v0.2d, v2.2d, v0.2d WORD $0x4ee18461 // add v1.2d, v3.2d, v1.2d WORD $0x54ffff61 // b.ne .LBB49_5 WORD $0x4ee08420 // add v0.2d, v1.2d, v0.2d WORD $0x5ef1b800 // addp d0, v0.2d WORD $0x9e66000a // fmov x10, d0 WORD $0xb40000eb // cbz x11, .LBB49_9 LBB49_7: WORD $0x8b090c0b // add x11, x0, x9, lsl #3 WORD $0xcb090108 // sub x8, x8, x9 LBB49_8: WORD $0xf8408569 // ldr x9, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x8b0a012a // add x10, x9, x10 WORD $0x54ffffa1 // b.ne .LBB49_8 LBB49_9: WORD $0xf900002a // str x10, [x1] WORD $0xd65f03c0 // ret TEXT ·_int64_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xf9400009 // ldr x9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400044b // b.lt .LBB50_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB50_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000016 // b .LBB50_6 LBB50_3: WORD $0x9240044b // and x11, x2, #0x3 WORD $0x4e080d20 // dup v0.2d, x9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB50_4: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x4ee23404 // cmgt v4.2d, v0.2d, v2.2d WORD $0x4ee33425 // cmgt v5.2d, v1.2d, v3.2d WORD $0x6ea41c40 // bit v0.16b, v2.16b, 
v4.16b WORD $0x6ea51c61 // bit v1.16b, v3.16b, v5.16b WORD $0x54ffff21 // b.ne .LBB50_4 WORD $0x4ee03422 // cmgt v2.2d, v1.2d, v0.2d WORD $0x6ee21c20 // bif v0.16b, v1.16b, v2.16b WORD $0x6e004001 // ext v1.16b, v0.16b, v0.16b, #8 WORD $0x5ee03422 // cmgt d2, d1, d0 WORD $0x2ee21c20 // bif v0.8b, v1.8b, v2.8b WORD $0x9e660009 // fmov x9, d0 WORD $0xb400010b // cbz x11, .LBB50_8 LBB50_6: WORD $0x8b0a0c0b // add x11, x0, x10, lsl #3 WORD $0xcb0a0108 // sub x8, x8, x10 LBB50_7: WORD $0xf840856a // ldr x10, [x11], #8 WORD $0xeb09015f // cmp x10, x9 WORD $0x9a89b149 // csel x9, x10, x9, lt WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff81 // b.ne .LBB50_7 LBB50_8: WORD $0xf9000029 // str x9, [x1] WORD $0xd65f03c0 // ret TEXT ·_int64_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xf9400009 // ldr x9, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400044b // b.lt .LBB51_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB51_3 WORD $0xaa1f03ea // mov x10, xzr WORD $0x14000016 // b .LBB51_6 LBB51_3: WORD $0x9240044b // and x11, x2, #0x3 WORD $0x4e080d20 // dup v0.2d, x9 WORD $0xcb0b010a // sub x10, x8, x11 WORD $0x91004009 // add x9, x0, #16 WORD $0xaa0a03ec // mov x12, x10 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB51_4: WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16] WORD $0x91008129 // add x9, x9, #32 WORD $0xf100118c // subs x12, x12, #4 WORD $0x4ee03444 // cmgt v4.2d, v2.2d, v0.2d WORD $0x4ee13465 // cmgt v5.2d, v3.2d, v1.2d WORD $0x6ea41c40 // bit v0.16b, v2.16b, v4.16b WORD $0x6ea51c61 // bit v1.16b, v3.16b, v5.16b WORD $0x54ffff21 // b.ne .LBB51_4 WORD $0x4ee13402 // cmgt v2.2d, v0.2d, v1.2d WORD $0x6ee21c20 // bif v0.16b, v1.16b, v2.16b WORD $0x6e004001 // ext v1.16b, v0.16b, v0.16b, #8 WORD $0x5ee13402 // cmgt d2, d0, d1 WORD $0x2ee21c20 // bif v0.8b, v1.8b, v2.8b WORD $0x9e660009 // fmov x9, d0 WORD $0xb400010b // cbz x11, .LBB51_8 LBB51_6: WORD $0x8b0a0c0b // add x11, x0, 
x10, lsl #3 WORD $0xcb0a0108 // sub x8, x8, x10 LBB51_7: WORD $0xf840856a // ldr x10, [x11], #8 WORD $0xeb09015f // cmp x10, x9 WORD $0x9a89c149 // csel x9, x10, x9, gt WORD $0xf1000508 // subs x8, x8, #1 WORD $0x54ffff81 // b.ne .LBB51_7 LBB51_8: WORD $0xf9000029 // str x9, [x1] WORD $0xd65f03c0 // ret TEXT ·_int64_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB52_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100111f // cmp x8, #4 WORD $0x54000062 // b.hs .LBB52_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB52_8 LBB52_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB52_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB52_8 WORD $0x9240046a // and x10, x3, #0x3 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB52_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10011ce // subs x14, x14, #4 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4ee08440 // add v0.2d, v2.2d, v0.2d WORD $0x4ee18461 // add v1.2d, v3.2d, v1.2d WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB52_6 WORD $0xb400018a // cbz x10, .LBB52_10 LBB52_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB52_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x8b0901a9 // add x9, x13, x9 WORD $0xf8008549 // str x9, [x10], #8 WORD $0x54ffff61 // b.ne .LBB52_9 
// (tail of ·_int64_add: common exit label)
LBB52_10:
	WORD $0xd65f03c0 // ret

// ·_int64_sub writes output[i] = input1[i] - input2[i] (64-bit integers)
// for i in [0, size).
//
// Arguments: input1+0(FP), input2+8(FP), output+16(FP) are pointers;
// size+24(FP) is the element count, of which only the low 32 bits are used,
// compared as a signed value (a count < 1 returns immediately).
//
// LBB53_6 processes 4 elements per iteration with two 2x64-bit vector
// subtracts; it is taken only when size >= 4 and the unsigned distance from
// each input pointer to the output pointer is at least 32 bytes. LBB53_9 is
// the scalar loop used for the remainder or when those checks fail.
TEXT ·_int64_sub(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt .LBB53_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB53_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000019 // b .LBB53_8
LBB53_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo .LBB53_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo .LBB53_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
LBB53_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x6ee28400 // sub v0.2d, v0.2d, v2.2d
	WORD $0x6ee38421 // sub v1.2d, v1.2d, v3.2d
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffee1 // b.ne .LBB53_6
	WORD $0xb400018a // cbz x10, .LBB53_10
LBB53_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB53_9:
	WORD $0xf8408589 // ldr x9, [x12], #8
	WORD $0xf840856d // ldr x13, [x11], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0xcb0d0129 // sub x9, x9, x13
	WORD $0xf8008549 // str x9, [x10], #8
	WORD $0x54ffff61 // b.ne .LBB53_9
LBB53_10:
	WORD $0xd65f03c0 // ret

// ·_int64_mul writes output[i] = input1[i] * input2[i] (low 64 bits of the
// product) for i in [0, size).
//
// Arguments: input1+0(FP), input2+8(FP), output+16(FP) are pointers;
// size+24(FP) is the element count, of which only the low 32 bits are used,
// compared as a signed value (a count < 1 returns immediately).
//
// Paths:
//   * LBB54_6 handles 4 elements per iteration. The lanes are moved to
//     general registers, multiplied with scalar MUL and re-inserted. It is
//     taken only when size >= 4 and the unsigned distance from each input
//     pointer to the output pointer is at least 32 bytes.
//   * LBB54_9 is the scalar loop for the remainder or when the checks fail.
//
// Fix: the generated code used x18 to hold a multiplicand and then a product.
// x18 is the platform register (reserved on Apple arm64 and exposed by Go
// only as R18_PLATFORM); its value may change asynchronously there, which
// would corrupt output lane 2 of each group. The four affected instructions
// now use x7, which this function does not otherwise touch. Instruction count
// and all branch offsets are unchanged.
TEXT ·_int64_mul(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x540006eb // b.lt .LBB54_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB54_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000027 // b .LBB54_8
LBB54_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000463 // b.lo .LBB54_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000403 // b.lo .LBB54_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
LBB54_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0x9e660011 // fmov x17, d0
	WORD $0x4e183c0f // mov x15, v0.d[1]
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x9e660024 // fmov x4, d1
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x4e183c23 // mov x3, v1.d[1]
	WORD $0x9e660047 // fmov x7, d2          (was x18)
	WORD $0x4e183c50 // mov x16, v2.d[1]
	WORD $0x9e660065 // fmov x5, d3
	WORD $0x4e183c66 // mov x6, v3.d[1]
	WORD $0x9b117cf1 // mul x17, x7, x17     (was x18)
	WORD $0x9b047ca7 // mul x7, x5, x4       (was x18)
	WORD $0x9b0f7e0f // mul x15, x16, x15
	WORD $0x9b037cd0 // mul x16, x6, x3
	WORD $0x9e670220 // fmov d0, x17
	WORD $0x9e6700e1 // fmov d1, x7          (was x18)
	WORD $0x4e181de0 // mov v0.d[1], x15
	WORD $0x4e181e01 // mov v1.d[1], x16
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffd21 // b.ne .LBB54_6
	WORD $0xb400018a // cbz x10, .LBB54_10
LBB54_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB54_9:
	WORD $0xf8408589 // ldr x9, [x12], #8
	WORD $0xf840856d // ldr x13, [x11], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x9b097da9 // mul x9, x13, x9
	WORD $0xf8008549 // str x9, [x10], #8
	WORD $0x54ffff61 // b.ne .LBB54_9
LBB54_10:
	WORD $0xd65f03c0 // ret

// (start of ·_int64_div; its argument loads and body follow this span)
TEXT ·_int64_div(SB), $0-32
MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400058b // b.lt .LBB55_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100091f // cmp x8, #2 WORD $0x54000062 // b.hs .LBB55_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x1400001c // b .LBB55_8 LBB55_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100415f // cmp x10, #16 WORD $0x54000303 // b.lo .LBB55_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100415f // cmp x10, #16 WORD $0x540002a3 // b.lo .LBB55_8 WORD $0x9240006a // and x10, x3, #0x1 WORD $0xaa0203eb // mov x11, x2 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0xaa0103ed // mov x13, x1 WORD $0xaa0903ec // mov x12, x9 WORD $0xaa0003ee // mov x14, x0 LBB55_6: WORD $0x3cc105c0 // ldr q0, [x14], #16 WORD $0x3cc105a1 // ldr q1, [x13], #16 WORD $0xf100098c // subs x12, x12, #2 WORD $0x9e660010 // fmov x16, d0 WORD $0x9e66002f // fmov x15, d1 WORD $0x4e183c11 // mov x17, v0.d[1] WORD $0x9acf0e0f // sdiv x15, x16, x15 WORD $0x4e183c30 // mov x16, v1.d[1] WORD $0x9ad00e30 // sdiv x16, x17, x16 WORD $0x9e6701e0 // fmov d0, x15 WORD $0x4e181e00 // mov v0.d[1], x16 WORD $0x3c810560 // str q0, [x11], #16 WORD $0x54fffe81 // b.ne .LBB55_6 WORD $0xb400018a // cbz x10, .LBB55_10 LBB55_8: WORD $0xd37df12c // lsl x12, x9, #3 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB55_9: WORD $0xf8408589 // ldr x9, [x12], #8 WORD $0xf840856d // ldr x13, [x11], #8 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x9acd0d29 // sdiv x9, x9, x13 WORD $0xf8008549 // str x9, [x10], #8 WORD $0x54ffff61 // b.ne .LBB55_9 LBB55_10: WORD $0xd65f03c0 // ret TEXT ·_float32_sum(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0x7100045f // cmp w2, #1 WORD $0x540000eb // b.lt .LBB56_3 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD 
$0x540000e2 // b.hs .LBB56_4 WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000014 // b .LBB56_7 LBB56_3: WORD $0x2f00e400 // movi d0, #0000000000000000 WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret LBB56_4: WORD $0x9240084a // and x10, x2, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x6f00e400 // movi v0.2d, #0000000000000000 WORD $0xaa0903ec // mov x12, x9 WORD $0x6f00e401 // movi v1.2d, #0000000000000000 LBB56_5: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4e20d440 // fadd v0.4s, v2.4s, v0.4s WORD $0x4e21d461 // fadd v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB56_5 WORD $0x4e20d420 // fadd v0.4s, v1.4s, v0.4s WORD $0x6e20d400 // faddp v0.4s, v0.4s, v0.4s WORD $0x7e30d800 // faddp s0, v0.2s WORD $0xb40000ea // cbz x10, .LBB56_9 LBB56_7: WORD $0x8b09080a // add x10, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 LBB56_8: WORD $0xbc404541 // ldr s1, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1e202820 // fadd s0, s1, s0 WORD $0x54ffffa1 // b.ne .LBB56_8 LBB56_9: WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float32_min(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xbd400000 // ldr s0, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400036b // b.lt .LBB57_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB57_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000010 // b .LBB57_6 LBB57_3: WORD $0x9240084a // and x10, x2, #0x7 WORD $0x4e040400 // dup v0.4s, v0.s[0] WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0xaa0903ec // mov x12, x9 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB57_4: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4ea0c440 // fminnm v0.4s, v2.4s, v0.4s 
WORD $0x4ea1c461 // fminnm v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB57_4 WORD $0x4ea1c400 // fminnm v0.4s, v0.4s, v1.4s WORD $0x6eb0c800 // fminnmv s0, v0.4s WORD $0xb40000ea // cbz x10, .LBB57_8 LBB57_6: WORD $0x8b09080a // add x10, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 LBB57_7: WORD $0xbc404541 // ldr s1, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1e207820 // fminnm s0, s1, s0 WORD $0x54ffffa1 // b.ne .LBB57_7 LBB57_8: WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float32_max(SB), $0-32 MOVD input+0(FP), R0 MOVD result+8(FP), R1 MOVD size+16(FP), R2 WORD $0xbd400000 // ldr s0, [x0] WORD $0x7100045f // cmp w2, #1 WORD $0x5400036b // b.lt .LBB58_8 WORD $0x92407c48 // and x8, x2, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB58_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000010 // b .LBB58_6 LBB58_3: WORD $0x9240084a // and x10, x2, #0x7 WORD $0x4e040400 // dup v0.4s, v0.s[0] WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100400b // add x11, x0, #16 WORD $0xaa0903ec // mov x12, x9 WORD $0x4ea01c01 // mov v1.16b, v0.16b LBB58_4: WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf100218c // subs x12, x12, #8 WORD $0x4e20c440 // fmaxnm v0.4s, v2.4s, v0.4s WORD $0x4e21c461 // fmaxnm v1.4s, v3.4s, v1.4s WORD $0x54ffff61 // b.ne .LBB58_4 WORD $0x4e21c400 // fmaxnm v0.4s, v0.4s, v1.4s WORD $0x6e30c800 // fmaxnmv s0, v0.4s WORD $0xb40000ea // cbz x10, .LBB58_8 LBB58_6: WORD $0x8b09080a // add x10, x0, x9, lsl #2 WORD $0xcb090108 // sub x8, x8, x9 LBB58_7: WORD $0xbc404541 // ldr s1, [x10], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1e206820 // fmaxnm s0, s1, s0 WORD $0x54ffffa1 // b.ne .LBB58_7 LBB58_8: WORD $0xbd000020 // str s0, [x1] WORD $0xd65f03c0 // ret TEXT ·_float32_add(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB59_10 WORD $0x92407c68 // 
and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB59_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB59_8 LBB59_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB59_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB59_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB59_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4e20d440 // fadd v0.4s, v2.4s, v0.4s WORD $0x4e21d461 // fadd v1.4s, v3.4s, v1.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB59_6 WORD $0xb400018a // cbz x10, .LBB59_10 LBB59_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB59_9: WORD $0xbc404580 // ldr s0, [x12], #4 WORD $0xbc404561 // ldr s1, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1e202820 // fadd s0, s1, s0 WORD $0xbc004540 // str s0, [x10], #4 WORD $0x54ffff61 // b.ne .LBB59_9 LBB59_10: WORD $0xd65f03c0 // ret TEXT ·_float32_sub(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB60_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB60_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB60_8 LBB60_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD 
$0x540002a3 // b.lo .LBB60_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB60_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB60_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x4ea2d400 // fsub v0.4s, v0.4s, v2.4s WORD $0x4ea3d421 // fsub v1.4s, v1.4s, v3.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB60_6 WORD $0xb400018a // cbz x10, .LBB60_10 LBB60_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB60_9: WORD $0xbc404580 // ldr s0, [x12], #4 WORD $0xbc404561 // ldr s1, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1e213800 // fsub s0, s0, s1 WORD $0xbc004540 // str s0, [x10], #4 WORD $0x54ffff61 // b.ne .LBB60_9 LBB60_10: WORD $0xd65f03c0 // ret TEXT ·_float32_mul(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB61_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB61_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB61_8 LBB61_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB61_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB61_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add 
x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB61_6: WORD $0xad7f8560 // ldp q0, q1, [x11, #-16] WORD $0x9100816b // add x11, x11, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0x6e20dc40 // fmul v0.4s, v2.4s, v0.4s WORD $0x6e21dc61 // fmul v1.4s, v3.4s, v1.4s WORD $0xad3f85a0 // stp q0, q1, [x13, #-16] WORD $0x910081ad // add x13, x13, #32 WORD $0x54fffee1 // b.ne .LBB61_6 WORD $0xb400018a // cbz x10, .LBB61_10 LBB61_8: WORD $0xd37ef52c // lsl x12, x9, #2 WORD $0xcb090108 // sub x8, x8, x9 WORD $0x8b0c004a // add x10, x2, x12 WORD $0x8b0c002b // add x11, x1, x12 WORD $0x8b0c000c // add x12, x0, x12 LBB61_9: WORD $0xbc404580 // ldr s0, [x12], #4 WORD $0xbc404561 // ldr s1, [x11], #4 WORD $0xf1000508 // subs x8, x8, #1 WORD $0x1e200820 // fmul s0, s1, s0 WORD $0xbc004540 // str s0, [x10], #4 WORD $0x54ffff61 // b.ne .LBB61_9 LBB61_10: WORD $0xd65f03c0 // ret TEXT ·_float32_div(SB), $0-32 MOVD input1+0(FP), R0 MOVD input2+8(FP), R1 MOVD output+16(FP), R2 MOVD size+24(FP), R3 WORD $0x7100047f // cmp w3, #1 WORD $0x5400052b // b.lt .LBB62_10 WORD $0x92407c68 // and x8, x3, #0xffffffff WORD $0xf100211f // cmp x8, #8 WORD $0x54000062 // b.hs .LBB62_3 WORD $0xaa1f03e9 // mov x9, xzr WORD $0x14000019 // b .LBB62_8 LBB62_3: WORD $0xaa1f03e9 // mov x9, xzr WORD $0xcb00004a // sub x10, x2, x0 WORD $0xf100815f // cmp x10, #32 WORD $0x540002a3 // b.lo .LBB62_8 WORD $0xcb01004a // sub x10, x2, x1 WORD $0xf100815f // cmp x10, #32 WORD $0x54000243 // b.lo .LBB62_8 WORD $0x9240086a // and x10, x3, #0x7 WORD $0x9100400b // add x11, x0, #16 WORD $0xcb0a0109 // sub x9, x8, x10 WORD $0x9100402c // add x12, x1, #16 WORD $0x9100404d // add x13, x2, #16 WORD $0xaa0903ee // mov x14, x9 LBB62_6: WORD $0xad7f8981 // ldp q1, q2, [x12, #-16] WORD $0x9100818c // add x12, x12, #32 WORD $0xf10021ce // subs x14, x14, #8 WORD $0x3cdf0160 // ldur q0, [x11, #-16] WORD $0x6e21fc00 
// fdiv v0.4s, v0.4s, v1.4s
	WORD $0x3cc20561 // ldr q1, [x11], #32
	WORD $0x6e22fc21 // fdiv v1.4s, v1.4s, v2.4s
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffee1 // b.ne .LBB62_6
	WORD $0xb400018a // cbz x10, .LBB62_10
// Scalar path: x9 elements are already done; turn it into a byte offset
// (4 bytes per element) and position the three cursors.
LBB62_8:
	WORD $0xd37ef52c // lsl x12, x9, #2
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
// One single-precision element per iteration: [x10] = [x12] / [x11].
LBB62_9:
	WORD $0xbc404580 // ldr s0, [x12], #4
	WORD $0xbc404561 // ldr s1, [x11], #4
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e211800 // fdiv s0, s0, s1
	WORD $0xbc004540 // str s0, [x10], #4
	WORD $0x54ffff61 // b.ne .LBB62_9
LBB62_10:
	WORD $0xd65f03c0 // ret

// _float64_sum(input, result, size) adds up size float64 values read from
// input and stores the total through result.
//   R0 = input, R1 = result, R2 = size
// Only the low 32 bits of size are examined, and the "< 1" test is a signed
// 32-bit compare (cmp w2, #1 / b.lt). A size below 1 stores 0.0.
// Sizes of 4 or more run a loop that consumes four doubles per iteration into
// two independent 2-lane accumulators, which are then folded into d0; the
// remaining size&3 elements are added one at a time. Because lanes accumulate
// independently, rounding order differs from a strictly sequential sum.
// NOTE(review): three 8-byte arguments are read (offsets 0, 8, 16) while the
// TEXT directive declares 32 bytes of arguments - confirm against the Go
// prototype.
TEXT ·_float64_sum(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0x7100045f // cmp w2, #1
	WORD $0x540000eb // b.lt .LBB63_3
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x540000e2 // b.hs .LBB63_4
	WORD $0x2f00e400 // movi d0, #0000000000000000
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000013 // b .LBB63_7
// size < 1: store 0.0 and return.
LBB63_3:
	WORD $0x2f00e400 // movi d0, #0000000000000000
	WORD $0xfd000020 // str d0, [x1]
	WORD $0xd65f03c0 // ret
// Vector setup: x10 = size & 3 (leftover), x9 = count handled by the vector
// loop, v0/v1 = zeroed accumulators.
LBB63_4:
	WORD $0x9240044a // and x10, x2, #0x3
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x6f00e400 // movi v0.2d, #0000000000000000
	WORD $0xaa0903ec // mov x12, x9
	WORD $0x6f00e401 // movi v1.2d, #0000000000000000
// Vector loop: 32 bytes (four doubles) per iteration.
LBB63_5:
	WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf100118c // subs x12, x12, #4
	WORD $0x4e60d440 // fadd v0.2d, v2.2d, v0.2d
	WORD $0x4e61d461 // fadd v1.2d, v3.2d, v1.2d
	WORD $0x54ffff61 // b.ne .LBB63_5
	WORD $0x4e60d420 // fadd v0.2d, v1.2d, v0.2d
	WORD $0x7e70d800 // faddp d0, v0.2d
	WORD $0xb40000ea // cbz x10, .LBB63_9
// Scalar tail: start at element x9, x8 = elements left.
LBB63_7:
	WORD $0x8b090c0a // add x10, x0, x9, lsl #3
	WORD $0xcb090108 // sub x8, x8, x9
LBB63_8:
	WORD $0xfc408541 // ldr d1, [x10], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e602820 // fadd d0, d1, d0
	WORD $0x54ffffa1 // b.ne .LBB63_8
LBB63_9:
	WORD $0xfd000020 // str d0, [x1]
	WORD $0xd65f03c0 // ret

// _float64_min(input, result, size) stores the smallest of size float64
// values through result, using FMINNM for every comparison.
//   R0 = input, R1 = result, R2 = size (low 32 bits, signed "< 1" test)
// The first element is loaded before the size check, so *input is read, and
// written to *result, even when size < 1.
// NOTE(review): callers presumably never pass an empty input - confirm.
// Sizes of 4 or more broadcast the first element into two 2-lane running
// minima, process four doubles per iteration, then reduce to d0; the
// remaining size&3 elements are handled one at a time.
// NOTE(review): three 8-byte arguments are read (offsets 0, 8, 16) while the
// TEXT directive declares 32 bytes of arguments - confirm against the Go
// prototype.
TEXT ·_float64_min(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0xfd400000 // ldr d0, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x5400036b // b.lt .LBB64_8
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB64_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000010 // b .LBB64_6
// Vector setup: x10 = size & 3, x9 = count for the vector loop, v0 and v1 both
// seeded with input[0].
LBB64_3:
	WORD $0x9240044a // and x10, x2, #0x3
	WORD $0x4e080400 // dup v0.2d, v0.d[0]
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xaa0903ec // mov x12, x9
	WORD $0x4ea01c01 // mov v1.16b, v0.16b
// Vector loop: four doubles per iteration.
LBB64_4:
	WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf100118c // subs x12, x12, #4
	WORD $0x4ee0c440 // fminnm v0.2d, v2.2d, v0.2d
	WORD $0x4ee1c461 // fminnm v1.2d, v3.2d, v1.2d
	WORD $0x54ffff61 // b.ne .LBB64_4
	WORD $0x4ee1c400 // fminnm v0.2d, v0.2d, v1.2d
	WORD $0x7ef0c800 // fminnmp d0, v0.2d
	WORD $0xb40000ea // cbz x10, .LBB64_8
// Scalar tail: start at element x9, x8 = elements left.
LBB64_6:
	WORD $0x8b090c0a // add x10, x0, x9, lsl #3
	WORD $0xcb090108 // sub x8, x8, x9
LBB64_7:
	WORD $0xfc408541 // ldr d1, [x10], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e607820 // fminnm d0, d1, d0
	WORD $0x54ffffa1 // b.ne .LBB64_7
LBB64_8:
	WORD $0xfd000020 // str d0, [x1]
	WORD $0xd65f03c0 // ret

// _float64_max(input, result, size) stores the largest of size float64 values
// through result, using FMAXNM for every comparison.
//   R0 = input, R1 = result, R2 = size (low 32 bits, signed "< 1" test)
// Same structure as the minimum routine: the first element is loaded before
// the size check (so *input is read even when size < 1), sizes of 4 or more
// use two 2-lane running maxima over four doubles per iteration, and the
// remaining size&3 elements are handled one at a time.
// NOTE(review): three 8-byte arguments are read (offsets 0, 8, 16) while the
// TEXT directive declares 32 bytes of arguments - confirm against the Go
// prototype.
TEXT ·_float64_max(SB), $0-32
	MOVD input+0(FP), R0
	MOVD result+8(FP), R1
	MOVD size+16(FP), R2
	WORD $0xfd400000 // ldr d0, [x0]
	WORD $0x7100045f // cmp w2, #1
	WORD $0x5400036b // b.lt .LBB65_8
	WORD $0x92407c48 // and x8, x2, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB65_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000010 // b .LBB65_6
// Vector setup: x10 = size & 3, x9 = count for the vector loop, v0 and v1 both
// seeded with input[0].
LBB65_3:
	WORD $0x9240044a // and x10, x2, #0x3
	WORD $0x4e080400 // dup v0.2d, v0.d[0]
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xaa0903ec // mov x12, x9
	WORD $0x4ea01c01 // mov v1.16b, v0.16b
// Vector loop: four doubles per iteration.
LBB65_4:
	WORD $0xad7f8d62 // ldp q2, q3, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf100118c // subs x12, x12, #4
	WORD $0x4e60c440 // fmaxnm v0.2d, v2.2d, v0.2d
	WORD $0x4e61c461 // fmaxnm v1.2d, v3.2d, v1.2d
	WORD $0x54ffff61 // b.ne .LBB65_4
	WORD $0x4e61c400 // fmaxnm v0.2d, v0.2d, v1.2d
	WORD $0x7e70c800 // fmaxnmp d0, v0.2d
	WORD $0xb40000ea // cbz x10, .LBB65_8
// Scalar tail: start at element x9, x8 = elements left.
LBB65_6:
	WORD $0x8b090c0a // add x10, x0, x9, lsl #3
	WORD $0xcb090108 // sub x8, x8, x9
LBB65_7:
	WORD $0xfc408541 // ldr d1, [x10], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e606820 // fmaxnm d0, d1, d0
	WORD $0x54ffffa1 // b.ne .LBB65_7
LBB65_8:
	WORD $0xfd000020 // str d0, [x1]
	WORD $0xd65f03c0 // ret

// _float64_add(input1, input2, output, size) writes
// output[i] = input1[i] + input2[i] for size float64 elements.
//   R0 = input1, R1 = input2, R2 = output, R3 = size
// Only the low 32 bits of size are examined (signed "< 1" test); a size
// below 1 returns without touching memory.
// The vector loop (four doubles per iteration) is used only when size >= 4
// and the unsigned differences output-input1 and output-input2 are both at
// least 32 bytes; otherwise every element goes through the scalar loop.
// After the vector loop the remaining size&3 elements are done one at a time.
TEXT ·_float64_add(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt .LBB66_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB66_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000019 // b .LBB66_8
// Overlap check, then vector setup: x10 = size & 3, x9 = vector element count.
LBB66_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo .LBB66_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo .LBB66_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: four doubles per iteration.
LBB66_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x4e60d440 // fadd v0.2d, v2.2d, v0.2d
	WORD $0x4e61d461 // fadd v1.2d, v3.2d, v1.2d
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffee1 // b.ne .LBB66_6
	WORD $0xb400018a // cbz x10, .LBB66_10
// Scalar path: byte offset of element x9, x8 = elements left.
LBB66_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB66_9:
	WORD $0xfc408580 // ldr d0, [x12], #8
	WORD $0xfc408561 // ldr d1, [x11], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e602820 // fadd d0, d1, d0
	WORD $0xfc008540 // str d0, [x10], #8
	WORD $0x54ffff61 // b.ne .LBB66_9
LBB66_10:
	WORD $0xd65f03c0 // ret

// _float64_sub(input1, input2, output, size) writes
// output[i] = input1[i] - input2[i] for size float64 elements.
//   R0 = input1, R1 = input2, R2 = output, R3 = size
// Size handling, the 32-byte overlap check and the vector/scalar split are
// identical to the addition routine; only the arithmetic instruction differs.
TEXT ·_float64_sub(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt .LBB67_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB67_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000019 // b .LBB67_8
// Overlap check, then vector setup: x10 = size & 3, x9 = vector element count.
LBB67_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo .LBB67_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo .LBB67_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: four doubles per iteration.
LBB67_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x4ee2d400 // fsub v0.2d, v0.2d, v2.2d
	WORD $0x4ee3d421 // fsub v1.2d, v1.2d, v3.2d
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffee1 // b.ne .LBB67_6
	WORD $0xb400018a // cbz x10, .LBB67_10
// Scalar path: byte offset of element x9, x8 = elements left.
LBB67_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB67_9:
	WORD $0xfc408580 // ldr d0, [x12], #8
	WORD $0xfc408561 // ldr d1, [x11], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e613800 // fsub d0, d0, d1
	WORD $0xfc008540 // str d0, [x10], #8
	WORD $0x54ffff61 // b.ne .LBB67_9
LBB67_10:
	WORD $0xd65f03c0 // ret

// _float64_mul(input1, input2, output, size) writes
// output[i] = input1[i] * input2[i] for size float64 elements.
//   R0 = input1, R1 = input2, R2 = output, R3 = size
// Size handling, the 32-byte overlap check and the vector/scalar split are
// identical to the addition routine; only the arithmetic instruction differs.
TEXT ·_float64_mul(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt .LBB68_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB68_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000019 // b .LBB68_8
// Overlap check, then vector setup: x10 = size & 3, x9 = vector element count.
LBB68_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo .LBB68_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo .LBB68_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: four doubles per iteration.
LBB68_6:
	WORD $0xad7f8560 // ldp q0, q1, [x11, #-16]
	WORD $0x9100816b // add x11, x11, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0xad7f8d82 // ldp q2, q3, [x12, #-16]
	WORD $0x9100818c // add x12, x12, #32
	WORD $0x6e60dc40 // fmul v0.2d, v2.2d, v0.2d
	WORD $0x6e61dc61 // fmul v1.2d, v3.2d, v1.2d
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffee1 // b.ne .LBB68_6
	WORD $0xb400018a // cbz x10, .LBB68_10
// Scalar path: byte offset of element x9, x8 = elements left.
LBB68_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB68_9:
	WORD $0xfc408580 // ldr d0, [x12], #8
	WORD $0xfc408561 // ldr d1, [x11], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e600820 // fmul d0, d1, d0
	WORD $0xfc008540 // str d0, [x10], #8
	WORD $0x54ffff61 // b.ne .LBB68_9
LBB68_10:
	WORD $0xd65f03c0 // ret

// _float64_div(input1, input2, output, size) writes
// output[i] = input1[i] / input2[i] for size float64 elements.
//   R0 = input1, R1 = input2, R2 = output, R3 = size
// Size handling, the 32-byte overlap check and the vector/scalar split are
// identical to the addition routine. In the vector loop the divisor pair is
// loaded first (from x12, derived from input2) and the dividend halves are
// loaded separately from x11 (derived from input1).
TEXT ·_float64_div(SB), $0-32
	MOVD input1+0(FP), R0
	MOVD input2+8(FP), R1
	MOVD output+16(FP), R2
	MOVD size+24(FP), R3
	WORD $0x7100047f // cmp w3, #1
	WORD $0x5400052b // b.lt .LBB69_10
	WORD $0x92407c68 // and x8, x3, #0xffffffff
	WORD $0xf100111f // cmp x8, #4
	WORD $0x54000062 // b.hs .LBB69_3
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0x14000019 // b .LBB69_8
// Overlap check, then vector setup: x10 = size & 3, x9 = vector element count.
LBB69_3:
	WORD $0xaa1f03e9 // mov x9, xzr
	WORD $0xcb00004a // sub x10, x2, x0
	WORD $0xf100815f // cmp x10, #32
	WORD $0x540002a3 // b.lo .LBB69_8
	WORD $0xcb01004a // sub x10, x2, x1
	WORD $0xf100815f // cmp x10, #32
	WORD $0x54000243 // b.lo .LBB69_8
	WORD $0x9240046a // and x10, x3, #0x3
	WORD $0x9100400b // add x11, x0, #16
	WORD $0xcb0a0109 // sub x9, x8, x10
	WORD $0x9100402c // add x12, x1, #16
	WORD $0x9100404d // add x13, x2, #16
	WORD $0xaa0903ee // mov x14, x9
// Vector loop: four doubles per iteration.
LBB69_6:
	WORD $0xad7f8981 // ldp q1, q2, [x12, #-16]
	WORD $0x9100818c // add x12, x12, #32
	WORD $0xf10011ce // subs x14, x14, #4
	WORD $0x3cdf0160 // ldur q0, [x11, #-16]
	WORD $0x6e61fc00 // fdiv v0.2d, v0.2d, v1.2d
	WORD $0x3cc20561 // ldr q1, [x11], #32
	WORD $0x6e62fc21 // fdiv v1.2d, v1.2d, v2.2d
	WORD $0xad3f85a0 // stp q0, q1, [x13, #-16]
	WORD $0x910081ad // add x13, x13, #32
	WORD $0x54fffee1 // b.ne .LBB69_6
	WORD $0xb400018a // cbz x10, .LBB69_10
// Scalar path: byte offset of element x9, x8 = elements left.
LBB69_8:
	WORD $0xd37df12c // lsl x12, x9, #3
	WORD $0xcb090108 // sub x8, x8, x9
	WORD $0x8b0c004a // add x10, x2, x12
	WORD $0x8b0c002b // add x11, x1, x12
	WORD $0x8b0c000c // add x12, x0, x12
LBB69_9:
	WORD $0xfc408580 // ldr d0, [x12], #8
	WORD $0xfc408561 // ldr d1, [x11], #8
	WORD $0xf1000508 // subs x8, x8, #1
	WORD $0x1e611800 // fdiv d0, d0, d1
	WORD $0xfc008540 // str d0, [x10], #8
	WORD $0x54ffff61 // b.ne .LBB69_9
LBB69_10:
	WORD $0xd65f03c0 // ret
golang-github-kelindar-simd-1.2.0/simd_stub.go000077500000000000000000000225241517522302000213070ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for details.
//go:build noasm || !(amd64 || arm64) // +build noasm !amd64,!arm64 package simd import "unsafe" func init() { hardware = false } func _uint8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint8_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint16_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint32_sum(input unsafe.Pointer, result 
unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _uint64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int8_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int8_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not 
available") } func _int8_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int8_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int8_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int8_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int8_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int16_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int16_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int16_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int16_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int16_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int16_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int16_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int32_add(input1 unsafe.Pointer, input2 
unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _int64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float32_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float32_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float32_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float32_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float32_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output 
unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float32_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float32_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float64_sum(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float64_min(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float64_max(input unsafe.Pointer, result unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float64_add(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float64_sub(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float64_mul(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } func _float64_div(input1 unsafe.Pointer, input2 unsafe.Pointer, output unsafe.Pointer, size uint64) { panic("simd: assembly not available") } golang-github-kelindar-simd-1.2.0/simd_test.go000066400000000000000000000066071517522302000213120ustar00rootroot00000000000000// Copyright (c) Roman Atachiants and contributors. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for details. 
package simd

import (
	"fmt"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

// Result represents a result of a benchmark
type Result struct {
	Type    string  // element type label passed to runBenchmark
	Name    string  // operation label passed to runBenchmark
	Size    int     // vector size label passed to runBenchmark
	Rate    float64 // nanoseconds per operation measured in "simd" mode
	Speedup float64 // "base" rate divided by "simd" rate; 0 when no simd rate was measured
}

// makeVector generates a test vector of count elements whose values cycle
// through 1..100.
func makeVector[T Number](count int) []T {
	arr := make([]T, count)
	for i := 0; i < count; i++ {
		arr[i] = T((i % 100) + 1)
	}
	return arr
}

// runBenchmark runs fn once in "base" mode and once in "simd" mode and
// reports the simd rate together with the speedup over the baseline.
func runBenchmark(b *testing.B, typ, name string, size int, fn func(b *testing.B)) Result {
	rate0 := measure(b, fmt.Sprintf("%v-%v-%v-base", typ, name, size), "base", fn)
	rate1 := measure(b, fmt.Sprintf("%v-%v-%v-simd", typ, name, size), "simd", fn)

	// A zero simd rate means nothing was measured (for example the
	// sub-benchmark was filtered out); report 0 instead of NaN or Inf.
	var speedup float64
	if rate1 > 0 {
		speedup = rate0 / rate1
	}

	return Result{
		Type:    typ,
		Name:    name,
		Size:    size,
		Rate:    rate1,
		Speedup: speedup,
	}
}

// measure switches to the given mode, runs fn as a sub-benchmark called name
// and returns the nanoseconds per operation of its last invocation. It
// returns 0 when the sub-benchmark did not run at all (b.Run skips
// sub-benchmarks that do not match the -bench filter).
func measure(b *testing.B, name, mode string, fn func(b *testing.B)) float64 {
	setMode(mode)

	// b.Run may call the closure several times with a growing b.N; the values
	// of the final call are the ones kept. The elapsed time is captured inside
	// the closure so framework bookkeeping after fn returns is not included.
	var elapsed time.Duration
	var ops int
	b.Run(name, func(b *testing.B) {
		start := time.Now()
		fn(b)
		elapsed = time.Since(start)
		ops = b.N
	})

	if ops == 0 {
		return 0
	}

	// Calculate nanoseconds per operation (rate)
	return float64(elapsed) / float64(ops)
}

// rangeModes calls fn once per mode ("base", then "simd"), switching the
// package into that mode before each call. The last mode stays active.
func rangeModes(fn func(mode string)) {
	for _, mode := range []string{"base", "simd"} {
		setMode(mode)
		fn(mode)
	}
}

// setMode changes the mode: "simd" sets the hardware flag from the avx2,
// apple and neon flags, "base" clears it. Any other value is ignored.
func setMode(mode string) {
	switch mode {
	case "simd":
		hardware = avx2 || apple || neon
	case "base":
		hardware = false
	}
}

// TestSum checks Sum on a two-element slice of every supported element type.
func TestSum(t *testing.T) {
	assert.Equal(t, 3, int(Sum([]int8{1, 2})))
	assert.Equal(t, 3, int(Sum([]int16{1, 2})))
	assert.Equal(t, 3, int(Sum([]int32{1, 2})))
	assert.Equal(t, 3, int(Sum([]int64{1, 2})))
	assert.Equal(t, 3, int(Sum([]uint8{1, 2})))
	assert.Equal(t, 3, int(Sum([]uint16{1, 2})))
	assert.Equal(t, 3, int(Sum([]uint32{1, 2})))
	assert.Equal(t, 3, int(Sum([]uint64{1, 2})))
	assert.Equal(t, 3, int(Sum([]float32{1, 2})))
	assert.Equal(t, 3, int(Sum([]float64{1, 2})))
	assert.Equal(t, 3, int(Sum([]int{1, 2})))
}

// TestMin checks Min on a three-element slice of every supported element type.
func TestMin(t *testing.T) {
	assert.Equal(t, 1, int(Min([]int8{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]int16{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]int32{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]int64{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]uint8{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]uint16{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]uint32{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]uint64{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]float32{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]float64{3, 1, 2})))
	assert.Equal(t, 1, int(Min([]int{3, 1, 2})))
}

// TestMax checks Max on a two-element slice of every supported element type.
func TestMax(t *testing.T) {
	assert.Equal(t, 2, int(Max([]int8{1, 2})))
	assert.Equal(t, 2, int(Max([]int16{1, 2})))
	assert.Equal(t, 2, int(Max([]int32{1, 2})))
	assert.Equal(t, 2, int(Max([]int64{1, 2})))
	assert.Equal(t, 2, int(Max([]uint8{1, 2})))
	assert.Equal(t, 2, int(Max([]uint16{1, 2})))
	assert.Equal(t, 2, int(Max([]uint32{1, 2})))
	assert.Equal(t, 2, int(Max([]uint64{1, 2})))
	assert.Equal(t, 2, int(Max([]float32{1, 2})))
	assert.Equal(t, 2, int(Max([]float64{1, 2})))
	assert.Equal(t, 2, int(Max([]int{1, 2})))
}